Read data

These data were compiled from lacZ sequencing results across multiple studies.

# Load each raw tab-delimited table. The three tables produced in this study
# are tagged with their mutation type, a study label, and the sequencing
# technology right after they are read.
snvs <- read_delim(file = "data/raw/SNV_data.txt", delim = "\t")
snvs$Type <- "SNV"
snvs$Study <- "This Study"
snvs$Technology <- "NGS"

ins <- read_delim(file = "data/raw/Insertion_data.txt", delim = "\t")
ins$Type <- "Insertion"
ins$Study <- "This Study"
ins$Technology <- "NGS"

del <- read_delim(file = "data/raw/Deletion_data.txt", delim = "\t")
del$Type <- "Deletion"
del$Study <- "This Study"
del$Technology <- "NGS"

# The historical table only receives a technology label here.
historical <- read_delim(file = "data/raw/Historical_data.txt", delim = "\t")
historical$Technology <- "Sanger"

# Classify every historical record into a mutation Type and make Codon numeric.
# case_when() uses the first rule that matches, so the order below matters:
#   1. explicit Deletion / Insertion flag columns,
#   2. "del" (any letter case) appearing in Alt,
#   3. Ref and Alt of different lengths,
#   4. equal-length changes longer than one base ("Complex"),
#   5. any remaining record whose Alt does not contain "del" ("SNV").
# Records matching no rule (e.g. a missing Alt) get an NA Type.
historical <- dplyr::mutate(
  historical,
  Type = dplyr::case_when(
    Deletion == 1 ~ "Deletion",
    Insertion == 1 ~ "Insertion",
    stringr::str_detect(
      Alt, stringr::regex("del", ignore_case = TRUE)
    ) ~ "Deletion",
    stringr::str_length(Alt) > stringr::str_length(Ref) ~ "Insertion",
    stringr::str_length(Alt) < stringr::str_length(Ref) ~ "Deletion",
    stringr::str_length(Ref) == stringr::str_length(Alt) &
      stringr::str_length(Ref) > 1 ~ "Complex",
    !stringr::str_detect(
      Alt, stringr::regex("del", ignore_case = TRUE)
    ) ~ "SNV"
  ),
  Codon = as.numeric(Codon)
)

# Combine the three NGS tables and the historical table into one data frame,
# then derive per-mutation annotations.
#
# The joins are natural full joins (no `by` given), so rows are matched on
# every column name the two tables share; `na_matches = "never"` stops NA keys
# from matching each other.
snvs_ins_del_clean <- snvs %>%
  dplyr::full_join(ins, keep = FALSE, na_matches = "never") %>%
  dplyr::full_join(del, keep = FALSE, na_matches = "never") %>%
  dplyr::full_join(historical, keep = FALSE, na_matches = "never") %>%
  dplyr::mutate(
    # NOTE(review): both labels are built before Ref/Alt are upper-cased
    # below, so they keep the original letter case -- confirm this is intended.
    mutation = paste0(Ref, ">", Alt),
    aa_change = paste0(`Ref A.A.`, ">", `Alt A.A.`),
    # 0 when the reference and alternate amino acids are identical, 1 when
    # they differ, NA when either one is missing.
    FunctionalChange = ifelse(`Ref A.A.` == `Alt A.A.`, 0, 1),
    Ref = str_to_upper(Ref),
    Alt = str_to_upper(Alt),
    # Maps Position onto 1, 2 or 3 in a repeating cycle starting at Position 1.
    codon_position = ((Position - 1) %% 3) + 1
  )


# Rewrite position as relative to the lacZ reference (new column PositionRef):
#   Position < 26       (pre-insertion)   -> Position - 3
#   Position > 40       (post-insertion)  -> Position - 18
#   Position 26 to 40 or missing          -> NA
# In the alignment printed later in this document, the MutaMouse sequence has
# 3 extra leading bases and a 15-base insertion at alignment columns 26-40
# relative to V00296.1, which is where the offsets 3 and 18 (= 3 + 15) come
# from.
#
# The column is built in one step with case_when() instead of assigning into
# slices of a column that does not exist yet, so a missing Position yields NA
# rather than an "NAs are not allowed in subscripted assignments" error.
snvs_ins_del_clean <- snvs_ins_del_clean %>%
  dplyr::mutate(PositionRef = dplyr::case_when(
    Position < 26 ~ Position - 3,
    Position > 40 ~ Position - 18
  ))

# Annotate each record with three-letter amino acid codes, the codon number
# relative to the reference, and a complete residue name.
snvs_ins_del_clean <- dplyr::mutate(
  snvs_ins_del_clean,
  # Both lookups are keyed on the amino acid columns, so this still won't
  # work for multi-codon mutations.
  residue_code = Biostrings::AMINO_ACID_CODE[`Ref A.A.`],
  alt_code = Biostrings::AMINO_ACID_CODE[`Alt A.A.`],
  CodonRef = ceiling(PositionRef / 3),
  # Complete residue name: three-letter code followed by the Codon column.
  residue_name = paste0(residue_code, Codon)
)

# Add domain information (codon ranges are inclusive at both ends):
#   Sugar Binding (49-219; PF02837)
#   β-Galactosidase (221-334; PF00703)
#   TIM Barrel (336-630; PF02836)
#   β-Galactosidase Small Chain (749-1022; PF02929)
# domain_breakpoints_nuc <- c(49*3, 219*3, 221*3, 334*3, 336*3, 630*3, 749*3, 1022*3)
d1 <- dplyr::between(snvs_ins_del_clean$Codon, 49, 219)
d2 <- dplyr::between(snvs_ins_del_clean$Codon, 221, 334)
d3 <- dplyr::between(snvs_ins_del_clean$Codon, 336, 630)
d4 <- dplyr::between(snvs_ins_del_clean$Codon, 749, 1022)

# Assign the whole column in one step rather than filling slices of a column
# that does not exist yet. The ranges do not overlap, so at most one rule
# matches; codons outside every range (or missing) get NA.
snvs_ins_del_clean$Domain <- dplyr::case_when(
  d1 ~ "Sugar Binding (PF02837)",
  d2 ~ "β-Galactosidase (PF00703)",
  d3 ~ "TIM Barrel (PF02836)",
  d4 ~ "β-Galactosidase Small Chain (PF02929)"
)

# Number of records per domain (NA = no domain assigned).
dplyr::count(snvs_ins_del_clean, Domain)
## # A tibble: 5 × 2
##   Domain                                    n
##   <chr>                                 <int>
## 1 Sugar Binding (PF02837)                 901
## 2 TIM Barrel (PF02836)                   2124
## 3 β-Galactosidase (PF00703)               384
## 4 β-Galactosidase Small Chain (PF02929)  1401
## 5 <NA>                                   1655
# Domain                                    n
# 1 Sugar Binding (PF02837)                 901
# 2 TIM Barrel (PF02836)                   2124
# 3 β-Galactosidase (PF00703)               384
# 4 β-Galactosidase Small Chain (PF02929)  1401
# 5 NA                                     1655

# Records that do not come from this study. A missing Study makes the
# comparison NA, and filter() drops such rows.
historical_clean <- dplyr::filter(snvs_ins_del_clean, Study != "This Study")

# Flag rows that exactly repeat an earlier row, and keep a copy of them.
duplicated <- duplicated(snvs_ins_del_clean)
duplicates <- snvs_ins_del_clean[duplicated, ]

# This will collapse duplicated lines, i.e., same mutation but
# from different animals/samples
snvs_ins_del_collapsed <- dplyr::distinct(snvs_ins_del_clean)

# Preview the first rows of the uncollapsed table.
knitr::kable(head(snvs_ins_del_clean))
Exposure Tissue Dose Position Ref Alt Tech Rep1 Tech Rep2 Tech Difference Background Avg Freq Count A:T to G:C G:C to A:T G:C to T:A G:C to C:G A:T to T:A A:T to C:G Insertion Deletion Codon Consequence Ref Codon Alt Codon Ref A.A. Alt A.A. Type Study Technology Tech Diff mutation aa_change FunctionalChange codon_position PositionRef residue_code alt_code CodonRef residue_name Domain
BaP Bone Marrow 100 19 T C 0.0189 0.0261 1.3829 0.0279 0.0225 9 1 0 0 0 0 0 0 0 7 missense TCA CCA S P SNV This Study NGS NA T>C S>P 1 1 16 Ser Pro 6 Ser7 NA
BaP Bone Marrow 100 19 T C 0.0159 0.0222 1.3964 0.0279 0.0190 9 1 0 0 0 0 0 0 0 7 missense TCA CCA S P SNV This Study NGS NA T>C S>P 1 1 16 Ser Pro 6 Ser7 NA
BaP Bone Marrow 100 41 C A 0.0035 0.0038 1.0731 0.0002 0.0036 1 0 0 1 0 0 0 0 0 8 missense CCC CAC P H SNV This Study NGS NA C>A P>H 1 2 23 Pro His 8 Pro8 NA
BaP Bone Marrow 100 51 A C 0.0025 0.0022 1.1190 0.0042 0.0024 1 0 0 0 0 0 1 0 0 11 missense TTA TTC L F SNV This Study NGS NA A>C L>F 1 3 33 Leu Phe 11 Leu11 NA
BaP Bone Marrow 100 109 C A 0.0044 0.0037 1.1947 0.0013 0.0041 1 0 0 1 0 0 0 0 0 31 missense CCC ACC P T SNV This Study NGS NA C>A P>T 1 1 91 Pro Thr 31 Pro31 NA
BaP Bone Marrow 100 110 C T 0.0045 0.0052 1.1395 0.0051 0.0048 1 0 1 0 0 0 0 0 0 31 missense CCC CTC P L SNV This Study NGS NA C>T P>L 1 2 92 Pro Leu 31 Pro31 NA

Plots

First, some plots to look at the composition of the data.

By position:

# Number of mutations by position (grouped in histogram).
# After the tally each row is one Position with its mutation count `n`. The
# histogram is weighted by `n` so bar heights are numbers of mutations; without
# the weight each bar would count distinct mutated positions instead.
snvs_ins_del_clean %>%
  dplyr::count(Position) %>%
  ggplot(aes(x = Position, weight = n)) +
  geom_histogram() +
  ggtitle("Number of mutations by position (grouped in histogram)")

# Number of mutations by position, high granularity.
# One bar segment per (Position, Alt) combination; segments sharing a
# Position are stacked.
snvs_ins_del_clean %>%
  dplyr::count(Position, Alt) %>%
  ggplot(aes(x = Position, y = n)) +
  geom_col() +
  ggtitle("Number of mutations by position, high granularity")

# Number of mutations by position, colored by type.
snvs_ins_del_clean %>%
  dplyr::count(Position, Type, Alt) %>%
  ggplot(aes(x = Position, y = n, fill = Type)) +
  geom_col() +
  ggtitle("Number of mutations by position, colored by type")

# Number of mutations by position, colored by type, log scale.
# Counts are transformed as log2(n + 1) before plotting.
snvs_ins_del_clean %>%
  dplyr::count(Position, Type, Alt) %>%
  ggplot(aes(x = Position, y = log2(n + 1), fill = Type)) +
  geom_col() +
  ggtitle("Number of mutations by position, colored by type, log scale")

This section explores how often each mutation was observed, i.e., the numbers of singletons versus repeatedly observed mutations.

# Total number of mutation records once exact duplicate rows are removed,
# including historical data and indels.
# Dropping duplicate rows removes mutations observed in multiple samples.
snvs_ins_del_clean %>%
  unique() %>%
  nrow()
## [1] 6152
# Distribution of how many times each (Position, Alt) mutation was observed.
# geom_bar() counts the mutations at each observation frequency; stat_count()
# prints that count as a text label slightly above each bar.
snvs_ins_del_clean %>%
  dplyr::count(Position, Alt) %>%
  ggplot(aes(x = factor(n))) +
  geom_bar() +
  stat_count(
    geom = "text",
    color = "black",
    aes(label = after_stat(count)),
    angle = 0,
    size = 2,
    position = position_stack(vjust = 1.1)
  ) +
  xlab("Number of times mutation observed") +
  ggtitle("Mutations per position: all mutations")

# Number of distinct mutations (position, alt base).
# Each mutation's total is summed over the Count column, so multiple counts of
# individual mutants are included; only the number of distinct mutations is
# returned.
# This won't be reported but is here for curiosity.
snvs_ins_del_clean %>%
  group_by(Position, Alt) %>%
  summarise(
    num_occurrences_includes_counts = sum(Count),
    .groups = "drop"
  ) %>%
  nrow()
## [1] 2171
# Distribution of the summed Count per (Position, Alt) mutation.
# Each mutation contributes one observation: its Count total. geom_bar()
# counts mutations per total and stat_count() labels each bar with that count.
snvs_ins_del_clean %>%
  group_by(Position, Alt) %>%
  summarise(
    num_occurrences_includes_counts = sum(Count),
    .groups = "drop"
  ) %>%
  ggplot(aes(x = factor(num_occurrences_includes_counts))) +
  geom_bar() +
  stat_count(
    geom = "text",
    color = "black",
    aes(label = after_stat(count)),
    angle = 0,
    size = 2,
    position = position_stack(vjust = 1.1)
  ) +
  xlab("Number of times mutation observed") +
  ggtitle("All mutations")

# Observation frequency of each (Position, Type, Alt) mutation, with bars
# split by mutation type. stat_count() labels every stacked segment with the
# number of mutations it contains.
snvs_ins_del_clean %>%
  dplyr::count(Position, Type, Alt) %>%
  ggplot(aes(x = factor(n), fill = Type)) +
  geom_bar() +
  stat_count(
    geom = "text",
    color = "black",
    aes(label = after_stat(count)),
    angle = 0,
    size = 2,
    position = position_stack(vjust = 0.9)
  ) +
  xlab("Number of times mutation observed") +
  ggtitle("Mutations by type: group by position and alternate allele")

# Observation frequency per (Position, Type), ignoring the alternate allele,
# with bars split by mutation type. stat_count() labels every stacked segment
# with the number of position/type combinations it contains.
snvs_ins_del_clean %>%
  dplyr::count(Position, Type) %>%
  ggplot(aes(x = factor(n), fill = Type)) +
  geom_bar() +
  stat_count(
    geom = "text",
    color = "black",
    aes(label = after_stat(count)),
    angle = 0,
    size = 2,
    position = position_stack(vjust = 0.9)
  ) +
  xlab("Number of times mutation observed") +
  ggtitle("Mutations by type: group by position")

# Observation frequency per (Position, Type, Alt, Tissue, Dose, Exposure)
# combination, with bars split by mutation type. stat_count() labels every
# stacked segment with the number of combinations it contains.
snvs_ins_del_clean %>%
  dplyr::count(Position, Type, Alt, Tissue, Dose, Exposure) %>%
  ggplot(aes(x = factor(n), fill = Type)) +
  geom_bar() +
  stat_count(
    geom = "text",
    color = "black",
    aes(label = after_stat(count)),
    angle = 0,
    size = 2,
    position = position_stack(vjust = 0.9)
  ) +
  xlab("Number of times mutation observed") +
  ggtitle("Mutations by type: group by position, alternate allele, tissue, dose, and exposure")

# Number of mutations per codon (SNVs only, records without a Codon dropped).
# Each (Codon, Type, Alt Codon) combination is tallied, then geom_bar() counts
# how many combinations were seen once, twice, and so on. stat_count() prints
# that count in the middle of each bar.
snvs_ins_del_clean %>%
  filter(Type == "SNV") %>%
  drop_na(Codon) %>%
  dplyr::count(Codon, Type, `Alt Codon`) %>%
  ggplot(aes(x = factor(n), fill = Type)) +
  geom_bar() +
  stat_count(
    geom = "text",
    color = "white",
    aes(label = after_stat(count)),
    angle = 0,
    size = 2,
    position = position_stack(vjust = 0.5)
  ) +
  xlab("Number of times mutation observed") +
  ggtitle("Number of mutations per codon")

Here are some numbers to answer the question of how many singletons vs repeated mutations were observed in the data, broken down in various ways:

# Number of mutations in total (one row per record)
dplyr::count(snvs_ins_del_clean) # 6,465
## # A tibble: 1 × 1
##       n
##   <int>
## 1  6465
# Number of mutations per chemical (Exposure column)
dplyr::count(snvs_ins_del_clean, Exposure)
## # A tibble: 15 × 2
##    Exposure      n
##    <chr>     <int>
##  1 BaP        1919
##  2 BaP-IU     1008
##  3 CEDU         14
##  4 Control    1727
##  5 EMS          11
##  6 ENU         809
##  7 Ercc1 -/m    34
##  8 NDBzA        80
##  9 NDMA         47
## 10 PRC         205
## 11 Sunlight     64
## 12 TEM         258
## 13 UVB         120
## 14 X-ray        91
## 15 Xpa -/-      78
#    Exposure      n
#  1 BaP        1919
#  2 BaP-IU     1008
#  3 CEDU         14
#  4 Control    1727
#  5 EMS          11
#  6 ENU         809
#  7 Ercc1 -/m    34
#  8 NDBzA        80
#  9 NDMA         47
# 10 PRC         205
# 11 Sunlight     64
# 12 TEM         258
# 13 UVB         120
# 14 X-ray        91
# 15 Xpa -/-      78

# Number of mutations by type
dplyr::count(snvs_ins_del_clean, Type)
## # A tibble: 4 × 2
##   Type          n
##   <chr>     <int>
## 1 Complex      20
## 2 Deletion   1080
## 3 Insertion   218
## 4 SNV        5147
#   Type          n
# 1 Complex      20
# 2 Deletion   1080
# 3 Insertion   218
# 4 SNV        5147

# How many singleton mutations?
# A singleton is a (Position, Alt) combination seen in exactly one row.
snvs_ins_del_clean %>%
  dplyr::count(Position, Alt, name = "Number_of_times_observed") %>%
  dplyr::filter(Number_of_times_observed == 1) %>%
  dplyr::count() # 1,225
## # A tibble: 1 × 1
##       n
##   <int>
## 1  1225
# tally() %>% pull(n) %>% sum()

# How many non-singleton mutations?
# Counts (Position, Alt) combinations seen in more than one row.
snvs_ins_del_clean %>%
  dplyr::count(Position, Alt) %>%
  dplyr::filter(n > 1) %>%
  nrow() # There are 946
## [1] 946
# How many singleton mutations, SNVs only?
# Counts (Position, Alt) combinations seen in exactly one SNV row.
snvs_ins_del_clean %>%
  dplyr::filter(Type == "SNV") %>%
  dplyr::count(Position, Alt) %>%
  dplyr::filter(n == 1) %>%
  nrow() # There are 707
## [1] 707
# How many non-singleton mutations, SNVs only?
# Counts (Position, Alt) combinations seen in more than one SNV row.
snvs_ins_del_clean %>%
  dplyr::filter(Type == "SNV") %>%
  dplyr::count(Position, Alt) %>%
  dplyr::filter(n > 1) %>%
  nrow() # There are 690
## [1] 690
# How many singleton mutations, deletions only?
# Counts (Position, Alt) combinations seen in exactly one deletion row.
snvs_ins_del_clean %>%
  dplyr::filter(Type == "Deletion") %>%
  dplyr::count(Position, Alt) %>%
  dplyr::filter(n == 1) %>%
  nrow() # There are 377
## [1] 377
# How many non-singleton mutations, deletions only?
# Counts (Position, Alt) combinations seen in more than one deletion row.
snvs_ins_del_clean %>%
  dplyr::filter(Type == "Deletion") %>%
  dplyr::count(Position, Alt) %>%
  dplyr::filter(n > 1) %>%
  nrow() # There are 222
## [1] 222
# How many singleton mutations, insertions only?
# Counts (Position, Alt) combinations seen in exactly one insertion row.
snvs_ins_del_clean %>%
  dplyr::filter(Type == "Insertion") %>%
  dplyr::count(Position, Alt) %>%
  dplyr::filter(n == 1) %>%
  nrow() # There are 169
## [1] 169
# How many non-singleton mutations, insertions only?
# Counts (Position, Alt) combinations seen in more than one insertion row.
snvs_ins_del_clean %>%
  dplyr::filter(Type == "Insertion") %>%
  dplyr::count(Position, Alt) %>%
  dplyr::filter(n > 1) %>%
  nrow() # There are 20
## [1] 20
# Codons, singletons
# SNV rows with a known Codon: number of (Codon, Alt Codon) combinations seen
# exactly once.
snvs_ins_del_clean %>%
  dplyr::filter(Type == "SNV") %>%
  drop_na(Codon) %>%
  dplyr::count(Codon, `Alt Codon`) %>%
  dplyr::filter(n == 1) %>%
  nrow()
## [1] 708
# Codons, non-singletons
# SNV rows with a known Codon: number of (Codon, Alt Codon) combinations seen
# more than once.
snvs_ins_del_clean %>%
  dplyr::filter(Type == "SNV") %>%
  drop_na(Codon) %>%
  dplyr::count(Codon, `Alt Codon`) %>%
  dplyr::filter(n > 1) %>%
  nrow()
## [1] 690
# Codons, number of unique loci (codons) that are mutated more than once
# (SNV rows with a known Codon only).
snvs_ins_del_clean %>%
  dplyr::filter(Type == "SNV") %>%
  drop_na(Codon) %>%
  dplyr::count(Codon) %>%
  dplyr::filter(n > 1) %>%
  nrow()
## [1] 498
# Total number of codons that are mutated
# (distinct non-missing Codon values across all mutation types).
dplyr::n_distinct(snvs_ins_del_clean$Codon, na.rm = TRUE)
## [1] 691
# Number of distinct non-missing Codon values among SNV rows only.
snvs_ins_del_clean %>%
  dplyr::filter(Type == "SNV") %>%
  pull(Codon) %>%
  dplyr::n_distinct(na.rm = TRUE)
## [1] 682
# Number of Codon values that occur in more than one row, all mutation types.
# NOTE(review): unlike the other codon counts in this section, rows with a
# missing Codon are not dropped here, so NA is tallied as its own group --
# confirm this is intended.
snvs_ins_del_clean %>%
  dplyr::count(Codon) %>%
  dplyr::filter(n > 1) %>%
  nrow()
## [1] 509

Read lacZ sequence

Compare the MutaMouse lacZ sequence to the reference sequence. This section also translates both sequences to amino acids.

# Manually downloaded...
# laczref <- readDNAStringSet("data/raw/lacZ.reference.paper.fa", format="fasta",
#               use.names=TRUE, with.qualities=FALSE)

################################################################################
# Load sequence from NCBI
################################################################################

# Search the protein database for beta galactosidase and fetch the hits as
# FASTA XML.
# NOTE(review): neither object is used again in the part of the document
# visible here -- confirm whether they are needed.
betagal <- esearch("beta galactosidase", "protein")
betagal_x <- efetch(betagal, retmode = "xml", rettype = "fasta")

# Download nucleotide record V00296.1 as FASTA text into a temporary file,
# then read that file back in as the reference sequence.
tmp <- tempfile()
lacZ_NCBI <- efetch(
  uid = "V00296.1",
  db = "nucleotide",
  rettype = "fasta",
  retmode = "text",
  outfile = tmp
)
laczref <- readDNAStringSet(tmp)

# MutaMouse lacZ sequence from the local FASTA file.
lacz_mutamouse <- readDNAStringSet(
  "data/raw/lacZ.fa",
  format = "fasta",
  with.qualities = FALSE,
  use.names = TRUE
)

# Align the NCBI reference sequence with the MutaMouse lacZ sequence and print
# a summary of the alignment.
alignment <- msa(c(laczref, lacz_mutamouse))
## use default substitution matrix
print(alignment)
## CLUSTAL 2.1  
## 
## Call:
##    msa(c(laczref, lacz_mutamouse))
## 
## MsaDNAMultipleAlignment with 2 rows and 3096 columns
##     aln                                                    names
## [1] ---ACCATGATTACGGATTCACTGG-...GGTCTGGTGTCAAAAATAATAATAA V00296.1 E. coli ...
## [2] ATGACCATGATTACGGATTCACTGGA...GGTCTGGTGTCAAAAATAATAATAA lacZ
## Con ???ACCATGATTACGGATTCACTGG?...GGTCTGGTGTCAAAAATAATAATAA Consensus
# Translate both nucleotide sequences to amino acid sequences.
laczref_aa <- Biostrings::translate(laczref)
lacz_mutamouse_aa <- Biostrings::translate(lacz_mutamouse)

# Recompute the alignment and print all of it, with sequence names shown.
# NOTE(review): this repeats the msa() call made earlier with the same
# inputs -- confirm whether the recomputation is needed.
alignment <- msa(c(laczref, lacz_mutamouse))
## use default substitution matrix
print(alignment, show = "complete", showNames = TRUE, type = "upperlower")
## 
## MsaDNAMultipleAlignment with 2 rows and 3096 columns
##     aln (1..54)                                            names
## [1] ---ACCATGATTACGGATTCACTGG---------------CCGTCGTTTTACAA V00296.1 E. coli ...
## [2] ATGACCATGATTACGGATTCACTGGAATTCCCGGGGATCCCCGTCGTTTTACAA lacZ
## Con ...................................................... Consensus 
## 
##     aln (55..108)                                          names
## [1] CGTCGTGACTGGGAAAACCCTGGCGTTACCCAACTTAATCGCCTTGCAGCACAT V00296.1 E. coli ...
## [2] CGTCGTGACTGGGAAAACCCTGGCGTTACCCAACTTAATCGCCTTGCAGCACAT lacZ
## Con ...................................................... Consensus 
## 
##     aln (109..162)                                         names
## [1] CCCCCTTTCGCCAGCTGGCGTAATAGCGAAGAGGCCCGCACCGATCGCCCTTCC V00296.1 E. coli ...
## [2] CCCCCTTTCGCCAGCTGGCGTAATAGCGAAGAGGCCCGCACCGATCGCCCTTCC lacZ
## Con ...................................................... Consensus 
## 
##     aln (163..216)                                         names
## [1] CAACAGTTGCGCAGCCTGAATGGCGAATGGCGCTTTGCCTGGTTTCCGGCACCA V00296.1 E. coli ...
## [2] CAACAGTTGCGCAGCCTGAATGGCGAATGGCGCTTTGCCTGGTTTCCGGCACCA lacZ
## Con ...................................................... Consensus 
## 
##     aln (217..270)                                         names
## [1] GAAGCGGTGCCGGAAAGCTGGCTGGAGTGCGATCTTCCTGAGGCCGATACTGTC V00296.1 E. coli ...
## [2] GAAGCGGTGCCGGAAAGCTGGCTGGAGTGCGATCTTCCTGAGGCCGATACTGTC lacZ
## Con ...................................................... Consensus 
## 
##     aln (271..324)                                         names
## [1] GTCGTCCCCTCAAACTGGCAGATGCACGGTTACGATGCGCCCATCTACACCAAC V00296.1 E. coli ...
## [2] GTCGTCCCCTCAAACTGGCAGATGCACGGTTACGATGCGCCCATCTACACCAAC lacZ
## Con ...................................................... Consensus 
## 
##     aln (325..378)                                         names
## [1] GTAACCTATCCCATTACGGTCAATCCGCCGTTTGTTCCCACGGAGAATCCGACG V00296.1 E. coli ...
## [2] GTGACCTATCCCATTACGGTCAATCCGCCGTTTGTTCCCACGGAGAATCCGACG lacZ
## Con ...................................................... Consensus 
## 
##     aln (379..432)                                         names
## [1] GGTTGTTACTCGCTCACATTTAATGTTGATGAAAGCTGGCTACAGGAAGGCCAG V00296.1 E. coli ...
## [2] GGTTGTTACTCGCTCACATTTAATGTTGATGAAAGCTGGCTACAGGAAGGCCAG lacZ
## Con ...................................................... Consensus 
## 
##     aln (433..486)                                         names
## [1] ACGCGAATTATTTTTGATGGCGTTAACTCGGCGTTTCATCTGTGGTGCAACGGG V00296.1 E. coli ...
## [2] ACGCGAATTATTTTTGATGGCGTTAACTCGGCGTTTCATCTGTGGTGCAACGGG lacZ
## Con ...................................................... Consensus 
## 
##     aln (487..540)                                         names
## [1] CGCTGGGTCGGTTACGGCCAGGACAGTCGTTTGCCGTCTGAATTTGACCTGAGC V00296.1 E. coli ...
## [2] CGCTGGGTCGGTTACGGCCAGGACAGTCGTTTGCCGTCTGAATTTGACCTGAGC lacZ
## Con ...................................................... Consensus 
## 
##     aln (541..594)                                         names
## [1] GCATTTTTACGCGCCGGAGAAAACCGCCTCGCGGTGATGGTGCTGCGTTGGAGT V00296.1 E. coli ...
## [2] GCATTTTTACGCGCCGGAGAAAACCGCCTCGCGGTGATGGTGCTGCGCTGGAGT lacZ
## Con ...................................................... Consensus 
## 
##     aln (595..648)                                         names
## [1] GACGGCAGTTATCTGGAAGATCAGGATATGTGGCGGATGAGCGGCATTTTCCGT V00296.1 E. coli ...
## [2] GACGGCAGTTATCTGGAAGATCAGGATATGTGGCGGATGAGCGGCATTTTCCGT lacZ
## Con ...................................................... Consensus 
## 
##     aln (649..702)                                         names
## [1] GACGTCTCGTTGCTGCATAAACCGACTACACAAATCAGCGATTTCCATGTTGCC V00296.1 E. coli ...
## [2] GACGTCTCGTTGCTGCATAAACCGACTACACAAATCAGCGATTTCCATGTTGCC lacZ
## Con ...................................................... Consensus 
## 
##     aln (703..756)                                         names
## [1] ACTCGCTTTAATGATGATTTCAGCCGCGCTGTACTGGAGGCTGAAGTTCAGATG V00296.1 E. coli ...
## [2] ACTCGCTTTAATGATGATTTCAGCCGCGCTGTACTGGAGGCTGAAGTTCAGATG lacZ
## Con ...................................................... Consensus 
## 
##     aln (757..810)                                         names
## [1] TGCGGCGAGTTGCGTGACTACCTACGGGTAACAGTTTCTTTATGGCAGGGTGAA V00296.1 E. coli ...
## [2] TGCGGCGAGTTGCGTGACTACCTACGGGTAACAGTTTCTTTATGGCAGGGTGAA lacZ
## Con ...................................................... Consensus 
## 
##     aln (811..864)                                         names
## [1] ACGCAGGTCGCCAGCGGCACCGCGCCTTTCGGCGGTGAAATTATCGATGAGCGT V00296.1 E. coli ...
## [2] ACGCAGGTCGCCAGCGGCACCGCGCCTTTCGGCGGTGAAATTATCGATGAGCGT lacZ
## Con ...................................................... Consensus 
## 
##     aln (865..918)                                         names
## [1] GGTGGTTATGCCGATCGCGTCACACTACGTCTGAACGTCGAAAACCCGAAACTG V00296.1 E. coli ...
## [2] GGTGGTTATGCCGATCGCGTCACACTACGTCTGAACGTCGAAAACCCGAAACTG lacZ
## Con ...................................................... Consensus 
## 
##     aln (919..972)                                         names
## [1] TGGAGCGCCGAAATCCCGAATCTCTATCGTGCGGTGGTTGAACTGCACACCGCC V00296.1 E. coli ...
## [2] TGGAGCGCCGAAATCCCGAATCTCTATCGTGCGGTGGTTGAACTGCACACCGCC lacZ
## Con ...................................................... Consensus 
## 
##     aln (973..1026)                                        names
## [1] GACGGCACGCTGATTGAAGCAGAAGCCTGCGATGTCGGTTTCCGCGAGGTGCGG V00296.1 E. coli ...
## [2] GACGGCACGCTGATTGAAGCAGAAGCCTGCGATGTCGGTTTCCGCGAGGTGCGG lacZ
## Con ...................................................... Consensus 
## 
##     aln (1027..1080)                                       names
## [1] ATTGAAAATGGTCTGCTGCTGCTGAACGGCAAGCCGTTGCTGATTCGAGGCGTT V00296.1 E. coli ...
## [2] ATTGAAAATGGTCTGCTGCTGCTGAACGGCAAGCCGTTGCTGATTCGAGGCGTT lacZ
## Con ...................................................... Consensus 
## 
##     aln (1081..1134)                                       names
## [1] AACCGTCACGAGCATCATCCTCTGCATGGTCAGGTCATGGATGAGCAGACGATG V00296.1 E. coli ...
## [2] AACCGTCACGAGCATCATCCTCTGCATGGTCAGGTCATGGATGAGCAGACGATG lacZ
## Con ...................................................... Consensus 
## 
##     aln (1135..1188)                                       names
## [1] GTGCAGGATATCCTGCTGATGAAGCAGAACAACTTTAACGCCGTGCGCTGTTCG V00296.1 E. coli ...
## [2] GTGCAGGATATCCTGCTGATGAAGCAGAACAACTTTAACGCCGTGCGCTGTTCG lacZ
## Con ...................................................... Consensus 
## 
##     aln (1189..1242)                                       names
## [1] CATTATCCGAACCATCCGCTGTGGTACACGCTGTGCGACCGCTACGGCCTGTAT V00296.1 E. coli ...
## [2] CATTATCCGAACCATCCGCTGTGGTACACGCTGTGCGACCGCTACGGCCTGTAT lacZ
## Con ...................................................... Consensus 
## 
##     aln (1243..1296)                                       names
## [1] GTGGTGGATGAAGCCAATATTGAAACCCACGGCATGGTGCCAATGAATCGTCTG V00296.1 E. coli ...
## [2] GTGGTGGATGAAGCCAATATTGAAACCCACGGCATGGTGCCAATGAATCGTCTG lacZ
## Con ...................................................... Consensus 
## 
##     aln (1297..1350)                                       names
## [1] ACCGATGATCCGCGCTGGCTACCGGCGATGAGCGAACGCGTAACGCGAATGGTG V00296.1 E. coli ...
## [2] ACCGATGATCCGCGCTGGCTACCGGCGATGAGCGAACGCGTAACGCGAATGGTG lacZ
## Con ...................................................... Consensus 
## 
##     aln (1351..1404)                                       names
## [1] CAGCGCGATCGTAATCACCCGAGTGTGATCATCTGGTCGCTGGGGAATGAATCA V00296.1 E. coli ...
## [2] CAGCGCGATCGTAATCACCCGAGTGTGATCATCTGGTCGCTGGGGAATGAATCA lacZ
## Con ...................................................... Consensus 
## 
##     aln (1405..1458)                                       names
## [1] GGCCACGGCGCTAATCACGACGCGCTGTATCGCTGGATCAAATCTGTCGATCCT V00296.1 E. coli ...
## [2] GGCCACGGCGCTAATCACGACGCGCTGTATCGCTGGATCAAATCTGTCGATCCT lacZ
## Con ...................................................... Consensus 
## 
##     aln (1459..1512)                                       names
## [1] TCCCGCCCGGTGCAGTATGAAGGCGGCGGAGCCGACACCACGGCCACCGATATT V00296.1 E. coli ...
## [2] TCCCGCCCGGTGCAGTATGAAGGCGGCGGAGCCGACACCACGGCCACCGATATT lacZ
## Con ...................................................... Consensus 
## 
##     aln (1513..1566)                                       names
## [1] ATTTGCCCGATGTACGCGCGCGTGGATGAAGACCAGCCCTTCCCGGCTGTGCCG V00296.1 E. coli ...
## [2] ATTTGCCCGATGTACGCGCGCGTGGATGAAGACCAGCCCTTCCCGGCTGTGCCG lacZ
## Con ...................................................... Consensus 
## 
##     aln (1567..1620)                                       names
## [1] AAATGGTCCATCAAAAAATGGCTTTCGCTACCTGGAGAGACGCGCCCGCTGATC V00296.1 E. coli ...
## [2] AAATGGTCCATCAAAAAATGGCTTTCGCTACCTGGAGAGACGCGCCCGCTGATC lacZ
## Con ...................................................... Consensus 
## 
##     aln (1621..1674)                                       names
## [1] CTTTGCGAATACGCCCACGCGATGGGTAACAGTCTTGGCGGTTTCGCTAAATAC V00296.1 E. coli ...
## [2] CTTTGCGAATACGCCCACGCGATGGGTAACAGTCTTGGCGGTTTCGCTAAATAC lacZ
## Con ...................................................... Consensus 
## 
##     aln (1675..1728)                                       names
## [1] TGGCAGGCGTTTCGTCAGTATCCCCGTTTACAGGGCGGCTTCGTCTGGGACTGG V00296.1 E. coli ...
## [2] TGGCAGGCGTTTCGTCAGTATCCCCGTTTACAGGGCGGCTTCGTCTGGGACTGG lacZ
## Con ...................................................... Consensus 
## 
##     aln (1729..1782)                                       names
## [1] GTGGATCAGTCGCTGATTAAATATGATGAAAACGGCAACCCGTGGTCGGCTTAC V00296.1 E. coli ...
## [2] GTGGATCAGTCGCTGATTAAATATGATGAAAACGGCAACCCGTGGTCGGCTTAC lacZ
## Con ...................................................... Consensus 
## 
##     aln (1783..1836)                                       names
## [1] GGCGGTGATTTTGGCGATACGCCGAACGATCGCCAGTTCTGTATGAACGGTCTG V00296.1 E. coli ...
## [2] GGCGGTGATTTTGGCGATACGCCGAACGATCGCCAGTTCTGTATGAACGGTCTG lacZ
## Con ...................................................... Consensus 
## 
##     aln (1837..1890)                                       names
## [1] GTCTTTGCCGACCGCACGCCGCATCCAGCGCTGACGGAAGCAAAACACCAGCAG V00296.1 E. coli ...
## [2] GTCTTTGCCGACCGCACGCCGCATCCAGCGCTGACGGAAGCAAAACACCAGCAG lacZ
## Con ...................................................... Consensus 
## 
##     aln (1891..1944)                                       names
## [1] CAGTTTTTCCAGTTCCGTTTATCCGGGCAAACCATCGAAGTGACCAGCGAATAC V00296.1 E. coli ...
## [2] CAGTTTTTCCAGTTCCGTTTATCCGGGCAAACCATCGAAGTGACCAGCGAATAC lacZ
## Con ...................................................... Consensus 
## 
##     aln (1945..1998)                                       names
## [1] CTGTTCCGTCATAGCGATAACGAGCTCCTGCACTGGATGGTGGCGCTGGATGGT V00296.1 E. coli ...
## [2] CTGTTCCGTCATAGCGATAACGAGCTCCTGCACTGGATGGTGGCGCTGGATGGT lacZ
## Con ...................................................... Consensus 
## 
##     aln (1999..2052)                                       names
## [1] AAGCCGCTGGCAAGCGGTGAAGTGCCTCTGGATGTCGCTCCACAAGGTAAACAG V00296.1 E. coli ...
## [2] AAGCCGCTGGCAAGCGGTGAAGTGCCTCTGGATGTCGCTCCACAAGGTAAACAG lacZ
## Con ...................................................... Consensus 
## 
##     aln (2053..2106)                                       names
## [1] TTGATTGAACTGCCTGAACTACCGCAGCCGGAGAGCGCCGGGCAACTCTGGCTC V00296.1 E. coli ...
## [2] TTGATTGAACTGCCTGAACTACCGCAGCCGGAGAGCGCCGGGCAACTCTGGCTC lacZ
## Con ...................................................... Consensus 
## 
##     aln (2107..2160)                                       names
## [1] ACAGTACGCGTAGTGCAACCGAACGCGACCGCATGGTCAGAAGCCGGGCACATC V00296.1 E. coli ...
## [2] ACAGTACGCGTAGTGCAACCGAACGCGACCGCATGGTCAGAAGCCGGGCACATC lacZ
## Con ...................................................... Consensus 
## 
##     aln (2161..2214)                                       names
## [1] AGCGCCTGGCAGCAGTGGCGTCTGGCGGAAAACCTCAGTGTGACGCTCCCCGCC V00296.1 E. coli ...
## [2] AGCGCCTGGCAGCAGTGGCGTCTGGCGGAAAACCTCAGTGTGACGCTCCCCGCC lacZ
## Con ...................................................... Consensus 
## 
##     aln (2215..2268)                                       names
## [1] GCGTCCCACGCCATCCCGCATCTGACCACCAGCGAAATGGATTTTTGCATCGAG V00296.1 E. coli ...
## [2] GCGTCCCACGCCATCCCGCATCTGACCACCAGCGAAATGGATTTTTGCATCGAG lacZ
## Con ...................................................... Consensus 
## 
##     aln (2269..2322)                                       names
## [1] CTGGGTAATAAGCGTTGGCAATTTAACCGCCAGTCAGGCTTTCTTTCACAGATG V00296.1 E. coli ...
## [2] CTGGGTAATAAGCGTTGGCAATTTAACCGCCAGTCAGGCTTTCTTTCACAGATG lacZ
## Con ...................................................... Consensus 
## 
##     aln (2323..2376)                                       names
## [1] TGGATTGGCGATAAAAAACAACTGCTGACGCCGCTGCGCGATCAGTTCACCCGT V00296.1 E. coli ...
## [2] TGGATTGGCGATAAAAAACAACTGCTGACGCCGCTGCGCGATCAGTTCACCCGT lacZ
## Con ...................................................... Consensus 
## 
##     aln (2377..2430)                                       names
## [1] GCACCGCTGGATAACGACATTGGCGTAAGTGAAGCGACCCGCATTGACCCTAAC V00296.1 E. coli ...
## [2] GCACCGCTGGATAACGACATTGGCGTAAGTGAAGCGACCCGCATTGACCCTAAC lacZ
## Con ...................................................... Consensus 
## 
##     aln (2431..2484)                                       names
## [1] GCCTGGGTCGAACGCTGGAAGGCGGCGGGCCATTACCAGGCCGAAGCAGCGTTG V00296.1 E. coli ...
## [2] GCCTGGGTCGAACGCTGGAAGGCGGCGGGCCATTACCAGGCCGAAGCAGCGTTG lacZ
## Con ...................................................... Consensus 
## 
##     aln (2485..2538)                                       names
## [1] TTGCAGTGCACGGCAGATACACTTGCTGATGCGGTGCTGATTACGACCGCTCAC V00296.1 E. coli ...
## [2] TTGCAGTGCACGGCAGATACACTTGCTGATGCGGTGCTGATTACGACCGCTCAC lacZ
## Con ...................................................... Consensus 
## 
##     aln (2539..2592)                                       names
## [1] GCGTGGCAGCATCAGGGGAAAACCTTATTTATCAGCCGGAAAACCTACCGGATT V00296.1 E. coli ...
## [2] GCGTGGCAGCATCAGGGGAAAACCTTATTTATCAGCCGGAAAACCTACCGGATT lacZ
## Con ...................................................... Consensus 
## 
##     aln (2593..2646)                                       names
## [1] GATGGTAGTGGTCAAATGGCGATTACCGTTGATGTTGAAGTGGCGAGCGATACA V00296.1 E. coli ...
## [2] GATGGTAGTGGTCAAATGGCGATTACCGTTGATGTTGAAGTGGCGAGCGATACA lacZ
## Con ...................................................... Consensus 
## 
##     aln (2647..2700)                                       names
## [1] CCGCATCCGGCGCGGATTGGCCTGAACTGCCAGCTGGCGCAGGTAGCAGAGCGG V00296.1 E. coli ...
## [2] CCGCATCCGGCGCGGATTGGCCTGAACTGCCAGCTGGCGCAGGTAGCAGAGCGG lacZ
## Con ...................................................... Consensus 
## 
##     aln (2701..2754)                                       names
## [1] GTAAACTGGCTCGGATTAGGGCCGCAAGAAAACTATCCCGACCGCCTTACTGCC V00296.1 E. coli ...
## [2] GTAAACTGGCTCGGATTAGGGCCGCAAGAAAACTATCCCGACCGCCTTACTGCC lacZ
## Con ...................................................... Consensus 
## 
##     aln (2755..2808)                                       names
## [1] GCCTGTTTTGACCGCTGGGATCTGCCATTGTCAGACATGTATACCCCGTACGTC V00296.1 E. coli ...
## [2] GCCTGTTTTGACCGCTGGGATCTGCCATTGTCAGACATGTATACCCCGTACGTC lacZ
## Con ...................................................... Consensus 
## 
##     aln (2809..2862)                                       names
## [1] TTCCCGAGCGAAAACGGTCTGCGCTGCGGGACGCGCGAATTGAATTATGGCCCA V00296.1 E. coli ...
## [2] TTCCCGAGCGAAAACGGTCTGCGCTGCGGGACGCGCGAATTGAATTATGGCCCA lacZ
## Con ...................................................... Consensus 
## 
##     aln (2863..2916)                                       names
## [1] CACCAGTGGCGCGGCGACTTCCAGTTCAACATCAGCCGCTACAGTCAACAGCAA V00296.1 E. coli ...
## [2] CACCAGTGGCGCGGCGACTTCCAGTTCAACATCAGCCGCTACAGTCAACAGCAA lacZ
## Con ...................................................... Consensus 
## 
##     aln (2917..2970)                                       names
## [1] CTGATGGAAACCAGCCATCGCCATCTGCTGCACGCGGAAGAAGGCACATGGCTG V00296.1 E. coli ...
## [2] CTGATGGAAACCAGCCATCGCCATCTGCTGCACGCGGAAGAAGGCACATGGCTG lacZ
## Con ...................................................... Consensus 
## 
##     aln (2971..3024)                                       names
## [1] AATATCGACGGTTTCCATATGGGGATTGGTGGCGACGACTCCTGGAGCCCGTCA V00296.1 E. coli ...
## [2] AATATCGACGGTTTCCATATGGGGATTGGTGGCGACGACTCCTGGAGCCCGTCA lacZ
## Con ...................................................... Consensus 
## 
##     aln (3025..3078)                                       names
## [1] GTATCGGCGGAATTCCAGCTGAGCGCCGGTCGCTACCATTACCAGTTGGTCTGG V00296.1 E. coli ...
## [2] GTATCGGCGGAATTACAGCTGAGCGCCGGTCGCTACCATTACCAGTTGGTCTGG lacZ
## Con ...................................................... Consensus 
## 
##     aln (3079..3096)   names
## [1] TGTCAAAAATAATAATAA V00296.1 E. coli ...
## [2] TGTCAAAAATAATAATAA lacZ
## Con .................. Consensus
# help("print,MsaDNAMultipleAlignment-method")
# msa::msaPrettyPrint(alignment, output = "asis")

# Sorted nucleotide positions of every mutation in the combined table
# (missing positions, if any, are kept at the end).
positions <- sort(snvs_ins_del_clean$Position, na.last = TRUE)

# Same, restricted to rows whose Study is known and is not "This Study".
positions_other_studies <- sort(
  snvs_ins_del_clean$Position[
    which(snvs_ins_del_clean$Study != "This Study")
  ],
  na.last = TRUE
)

How many mutations did we observe at CpG sites? SNVs only.

# How often is each base the reference allele of an SNV (C and G in particular)?
snvs %>%
  dplyr::count(Ref)
## # A tibble: 4 × 2
##   Ref       n
##   <chr> <int>
## 1 A       236
## 2 C      1402
## 3 G      1749
## 4 T       361
# Label each SNV as having a C/G or an A/T reference base, then tally per
# reference base. The label is computed but does not appear in the tally.
# NOTE(review): the column is called purine_pyrimidine, yet it separates C/G
# from A/T rather than purines (A/G) from pyrimidines (C/T).
snvs %>%
  dplyr::mutate(
    purine_pyrimidine = dplyr::if_else(Ref %in% c("C", "G"), "C/G", "A/T")
  ) %>%
  dplyr::count(Ref)
## # A tibble: 4 × 2
##   Ref       n
##   <chr> <int>
## 1 A       236
## 2 C      1402
## 3 G      1749
## 4 T       361
# Count SNVs by C/G versus A/T reference base and express each count as a
# fraction of all SNVs, rounded to three decimals.
# NOTE(review): purine_pyrimidine actually encodes C/G vs A/T, not
# purine vs pyrimidine.
snvs %>%
  dplyr::mutate(
    purine_pyrimidine = dplyr::if_else(Ref %in% c("C", "G"), "C/G", "A/T")
  ) %>%
  dplyr::count(purine_pyrimidine) %>%
  dplyr::mutate(freq = round(n / sum(n), 3))
## # A tibble: 2 × 3
##   purine_pyrimidine     n  freq
##   <chr>             <int> <dbl>
## 1 A/T                 597 0.159
## 2 C/G                3151 0.841
# Break down by compound: GenVisR is given one row per SNV with the columns
# sample / reference / variant, using the exposure group as the sample.
snvs_genvisr <- snvs %>%
  dplyr::transmute(sample = Exposure, reference = Ref, variant = Alt)
# Default output: the transition/transversion plot.
GenVisR::TvTi(snvs_genvisr, progress = FALSE, fileType = "MGI")
## NULL
# Same call, but returning the underlying frequency/proportion table.
GenVisR::TvTi(snvs_genvisr, out = "data", progress = FALSE, fileType = "MGI")
## $main
##          trans_tranv  sample Freq       Prop
## 1  A->C or T->G (TV)     BaP   17 0.01305684
## 2  A->G or T->C (TI)     BaP   12 0.00921659
## 3  A->T or T->A (TV)     BaP   46 0.03533026
## 4  G->A or C->T (TI)     BaP  242 0.18586790
## 5  G->C or C->G (TV)     BaP  256 0.19662058
## 6  G->T or C->A (TV)     BaP  729 0.55990783
## 7  A->C or T->G (TV)  BaP-IU   31 0.04372355
## 8  A->G or T->C (TI)  BaP-IU   14 0.01974612
## 9  A->T or T->A (TV)  BaP-IU   34 0.04795487
## 10 G->A or C->T (TI)  BaP-IU  137 0.19322990
## 11 G->C or C->G (TV)  BaP-IU  137 0.19322990
## 12 G->T or C->A (TV)  BaP-IU  356 0.50211566
## 13 A->C or T->G (TV) Control   28 0.03641092
## 14 A->G or T->C (TI) Control   32 0.04161248
## 15 A->T or T->A (TV) Control   40 0.05201560
## 16 G->A or C->T (TI) Control  409 0.53185956
## 17 G->C or C->G (TV) Control   55 0.07152146
## 18 G->T or C->A (TV) Control  205 0.26657997
## 25 A->C or T->G (TV)     ENU   29 0.04991394
## 26 A->G or T->C (TI)     ENU   44 0.07573150
## 27 A->T or T->A (TV)     ENU  160 0.27538726
## 28 G->A or C->T (TI)     ENU  199 0.34251291
## 29 G->C or C->G (TV)     ENU    7 0.01204819
## 30 G->T or C->A (TV)     ENU  142 0.24440620
## 31 A->C or T->G (TV)     PRC    6 0.03208556
## 32 A->G or T->C (TI)     PRC   21 0.11229947
## 33 A->T or T->A (TV)     PRC   37 0.19786096
## 34 G->A or C->T (TI)     PRC   78 0.41711230
## 35 G->C or C->G (TV)     PRC    5 0.02673797
## 36 G->T or C->A (TV)     PRC   40 0.21390374
## 37 A->C or T->G (TV)     TEM   13 0.06500000
## 38 A->G or T->C (TI)     TEM   13 0.06500000
## 39 A->T or T->A (TV)     TEM   20 0.10000000
## 40 G->A or C->T (TI)     TEM   87 0.43500000
## 41 G->C or C->G (TV)     TEM   12 0.06000000
## 42 G->T or C->A (TV)     TEM   55 0.27500000
## 
## $expect
## NULL
# All together: every SNV is assigned to one pseudo-sample so that the
# transition/transversion spectrum is computed over the whole data set.
snvs_genvisr <- snvs %>%
  dplyr::transmute(
    sample = "All lacZ Mutations",
    reference = Ref,
    variant = Alt
  )
GenVisR::TvTi(snvs_genvisr, progress = FALSE, fileType = "MGI")

## NULL
# Keep the pooled table for the class totals below; the outer parentheses
# print it as well.
(tv_ti_table <- GenVisR::TvTi(
  snvs_genvisr,
  out = "data",
  progress = FALSE,
  fileType = "MGI"
))
## $main
##         trans_tranv             sample Freq       Prop
## 1 A->C or T->G (TV) All lacZ Mutations  124 0.03308431
## 2 A->G or T->C (TI) All lacZ Mutations  136 0.03628602
## 3 A->T or T->A (TV) All lacZ Mutations  337 0.08991462
## 4 G->A or C->T (TI) All lacZ Mutations 1152 0.30736393
## 5 G->C or C->G (TV) All lacZ Mutations  472 0.12593383
## 6 G->T or C->A (TV) All lacZ Mutations 1527 0.40741729
## 
## $expect
## NULL
# Pull the "(TI)" / "(TV)" suffix out of each substitution label.
tv_ti_ratio <- dplyr::mutate(
  tv_ti_table$main,
  Class = str_extract(trans_tranv, regex("\\(\\w++\\)"))
)
# Attach the total count of each class (transitions, transversions) to its rows.
# Despite the object name, no ratio is computed here.
dplyr::group_by(tv_ti_ratio, Class) %>%
  dplyr::mutate(class_count = sum(Freq))
## # A tibble: 6 × 6
## # Groups:   Class [2]
##   trans_tranv       sample              Freq   Prop Class class_count
##   <fct>             <fct>              <int>  <dbl> <chr>       <int>
## 1 A->C or T->G (TV) All lacZ Mutations   124 0.0331 (TV)         2460
## 2 A->G or T->C (TI) All lacZ Mutations   136 0.0363 (TI)         1288
## 3 A->T or T->A (TV) All lacZ Mutations   337 0.0899 (TV)         2460
## 4 G->A or C->T (TI) All lacZ Mutations  1152 0.307  (TI)         1288
## 5 G->C or C->G (TV) All lacZ Mutations   472 0.126  (TV)         2460
## 6 G->T or C->A (TV) All lacZ Mutations  1527 0.407  (TV)         2460

Nucleotide mutation by type figure

################################################################################
# lacZ Sequence Visualization
################################################################################

# Sequence track built from the lacz_mutamouse sequence object.
lacZ_vis <- SequenceTrack(lacz_mutamouse, ucscChromosomeNames = FALSE)
# plotTracks(lacZ_vis, chromosome = "lacZ", from = 10, to = 40)

# Every mutation sits on one pseudo-chromosome named "lacZ"; each becomes a
# width-1 range at its Position, with all other columns kept as metadata.
snvs_ins_del_clean[["chromosome"]] <- "lacZ"
lacZ_granges <- makeGRangesFromDataFrame(
  snvs_ins_del_clean,
  seqnames.field = "chromosome",
  start.field = "Position",
  end.field = "Position",
  starts.in.df.are.0based = FALSE,
  ignore.strand = TRUE,
  keep.extra.columns = TRUE,
  seqinfo = NULL
)

lacZ_annotation <- AnnotationTrack(lacZ_granges, name = "lacZ data")
# plotTracks(list(lacZ_annotation, lacZ_vis), from=2200, to=2375)

# Some parameters to use in other tracks...

# Round `x` up to the next multiple of 10 strictly greater than `x`, so an axis
# limit built from it always leaves headroom above the tallest bar.
#
# The previous form, round(x + 5, -1), relied on R's round-half-to-even rule
# and was inconsistent at exact multiples of 10 (10 -> 20 but 20 -> 20,
# 30 -> 40, 0 -> 0). Results for all other values are unchanged.
#
# x: numeric vector of non-negative counts.
# Returns a numeric vector the same length as `x`.
roundup <- function(x) {
  (floor(x / 10) + 1) * 10
}
# Shared y-axis range for the nucleotide-level tracks: from 0 to the largest
# number of mutations seen at any single Position, passed through roundup().
overall_ylim_nt <- c(
  0,
  roundup(max(dplyr::count(snvs_ins_del_clean, Position)$n))
)
# overall_ylim = NULL

# Scaling factor applied to the axis text of the tracks (cex.axis).
axis_scale <- 0.5

# Per-position mutation counts, drawn as one vertical-line ("h") track per
# subset. Every track uses the same y range (overall_ylim_nt) and the same
# axis text scaling (axis_scale).

# Every mutation in the combined table.
dat_all <- dplyr::count(snvs_ins_del_clean, Position)
all_mutations <- DataTrack(
  name = "All Types",
  genome = "lacZ",
  chromosome = "lacZ",
  start = dat_all$Position,
  end = dat_all$Position,
  data = dat_all$n,
  type = "h",
  ylim = overall_ylim_nt,
  cex.axis = axis_scale
)

# Rows whose Consequence is "nonsense".
dat_nonsense <- snvs_ins_del_clean %>%
  dplyr::filter(Consequence == "nonsense") %>%
  dplyr::count(Position)
nonsense <- DataTrack(
  name = "Nonsense",
  genome = "lacZ",
  chromosome = "lacZ",
  start = dat_nonsense$Position,
  end = dat_nonsense$Position,
  data = dat_nonsense$n,
  type = "h",
  ylim = overall_ylim_nt,
  cex.axis = axis_scale
)

# Rows whose Consequence is "missense".
dat_missense <- snvs_ins_del_clean %>%
  dplyr::filter(Consequence == "missense") %>%
  dplyr::count(Position)
missense <- DataTrack(
  name = "Missense",
  genome = "lacZ",
  chromosome = "lacZ",
  start = dat_missense$Position,
  end = dat_missense$Position,
  data = dat_missense$n,
  type = "h",
  ylim = overall_ylim_nt,
  cex.axis = axis_scale
)

# Rows whose Consequence is "frameshift".
dat_frameshift <- snvs_ins_del_clean %>%
  dplyr::filter(Consequence == "frameshift") %>%
  dplyr::count(Position)
frameshift <- DataTrack(
  name = "Frameshift",
  genome = "lacZ",
  chromosome = "lacZ",
  start = dat_frameshift$Position,
  end = dat_frameshift$Position,
  data = dat_frameshift$n,
  type = "h",
  ylim = overall_ylim_nt,
  cex.axis = axis_scale
)

# Rows whose Type is "Insertion".
dat_insertions <- snvs_ins_del_clean %>%
  dplyr::filter(Type == "Insertion") %>%
  dplyr::count(Position)
insertions <- DataTrack(
  name = "Insertions",
  genome = "lacZ",
  chromosome = "lacZ",
  start = dat_insertions$Position,
  end = dat_insertions$Position,
  data = dat_insertions$n,
  type = "h",
  ylim = overall_ylim_nt,
  cex.axis = axis_scale
)

# Rows whose Type is "Deletion".
dat_deletions <- snvs_ins_del_clean %>%
  dplyr::filter(Type == "Deletion") %>%
  dplyr::count(Position)
deletions <- DataTrack(
  name = "Deletions",
  genome = "lacZ",
  chromosome = "lacZ",
  start = dat_deletions$Position,
  end = dat_deletions$Position,
  data = dat_deletions$n,
  type = "h",
  ylim = overall_ylim_nt,
  cex.axis = axis_scale
)

# plotTracks(list(lacZ_vis, all_mutations), type="horizon")

# Fill colour shared by the domain annotation boxes.
domains_feature_fill <- "slategray4"

# Protein domains expressed in nucleotide coordinates: the residue boundaries
# (49-219, 221-334, 336-630, 749-1022) are multiplied by 3.
# NOTE(review): if codon k covers nucleotides 3k - 2 .. 3k, then k * 3 is the
# last base of the start codon, so each domain begins 2 nt late — confirm
# whether this is intended.
domains_nt <- AnnotationTrack(
  name = "Domains",
  genome = "lacZ",
  chromosome = "lacZ",
  start = c(49, 221, 336, 749) * 3,
  end = c(219, 334, 630, 1022) * 3,
  id = c(
    "Sugar Binding",
    "β-Galactosidase",
    "TIM Barrel",
    "β-Galactosidase Small Chain"
  ),
  fill = domains_feature_fill
)

# The six count tracks stacked together, with one pale background band per
# domain drawn behind the data.
domain_ht_nt <- HighlightTrack(
  trackList = list(
    all_mutations,
    missense,
    nonsense,
    frameshift,
    insertions,
    deletions
  ),
  chromosome = "lacZ",
  start = c(49, 221, 336, 749) * 3,
  end = c(219, 334, 630, 1022) * 3,
  fill = c(
    "#FFE5E5", # rgb(255, 229, 229)
    "#F4FAED", # rgb(244, 250, 237)
    "#F0EAF5", # rgb(240, 234, 245)
    "#FCF0E6" # rgb(252, 240, 230)
  ),
  col = "#000000FF", # opaque black outline
  lwd = 0.2,
  inBackground = TRUE
)

# hs <- as.numeric(levels(ranked_hotspots[["Position"]]))
# hotspots_range <- GRanges(seqnames = "lacZ",
#                   ranges = IRanges(start = hs,
#                                    end = hs))
#
# deTrack <- AnnotationTrack(range = hotspots_range,
#                            genome = "lacZ",
#                            chromosome = "lacZ",
#                            name = "Hotspots",
#                            stacking = "squish")

# Coordinate axis shown underneath the data tracks.
gtrack <- GenomeAxisTrack(
  name = "lacZ gene",
  showId = TRUE,
  add53 = TRUE,
  littleTicks = TRUE
)

displayPars(domains_nt) <- list(size = 5)

# Figure: domain boxes on top, the highlighted count tracks, then the axis.
# (lacZ_vis and the commented-out hotspot track deTrack are left out.)
plotTracks(
  list(domains_nt, domain_ht_nt, gtrack),
  stackHeight = 1,
  featureAnnotation = "id",
  fontsize.feature = 7,
  fontcolor.feature = "white",
  background.panel = "transparent",
  background.title = "slategrey"
)

Nucleotide mutation by sequencing technology figure

# Per-position mutation counts split by the Technology column.

# Rows sequenced by NGS.
dat_nt_ngs <- snvs_ins_del_clean %>%
  dplyr::filter(Technology == "NGS") %>%
  dplyr::count(Position)
ngs_nt <- DataTrack(
  name = "NGS",
  genome = "lacZ",
  chromosome = "lacZ",
  start = dat_nt_ngs$Position,
  end = dat_nt_ngs$Position,
  data = dat_nt_ngs$n,
  type = "h",
  ylim = overall_ylim_nt,
  cex.axis = axis_scale
)

# Rows sequenced by Sanger.
dat_nt_sanger <- snvs_ins_del_clean %>%
  dplyr::filter(Technology == "Sanger") %>%
  dplyr::count(Position)
sanger_nt <- DataTrack(
  name = "Sanger",
  genome = "lacZ",
  chromosome = "lacZ",
  start = dat_nt_sanger$Position,
  end = dat_nt_sanger$Position,
  data = dat_nt_sanger$n,
  type = "h",
  ylim = overall_ylim_nt,
  cex.axis = axis_scale
)

# All / NGS / Sanger tracks with the domain bands drawn behind them.
# NOTE(review): domain starts use k * 3, the last base of codon k — confirm.
domain_ht_nt_tech <- HighlightTrack(
  trackList = list(all_mutations, ngs_nt, sanger_nt),
  chromosome = "lacZ",
  start = c(49, 221, 336, 749) * 3,
  end = c(219, 334, 630, 1022) * 3,
  fill = c(
    "#FFE5E5", # rgb(255, 229, 229, max = 255)
    "#F4FAED", # rgb(244, 250, 237, max = 255)
    "#F0EAF5", # rgb(240, 234, 245, max = 255)
    "#FCF0E6" # rgb(252, 240, 230, max = 255)
  ),
  col = "#000000FF", # opaque black outline
  lwd = 0.2,
  inBackground = TRUE
)

# (lacZ_vis and deTrack are left out of the figure.)
plotTracks(
  list(domains_nt, domain_ht_nt_tech, gtrack),
  stackHeight = 1,
  featureAnnotation = "id",
  fontsize.feature = 7,
  fontcolor.feature = "white",
  background.panel = "transparent",
  background.title = "slategrey" # brown?
)

Amino acid by sequencing technology figure

################################################################################
# Beta-Gal Protein Visualization
################################################################################

# Protein sequence track for laczref_aa; the four domains are passed as named
# residue ranges.
lacZ_vis_aa <- ProteinSequenceTrack(
  laczref_aa,
  name = "β-Gal",
  chromosome = "lacZ",
  labelPos = "below",
  cex = 0.5,
  range = IRanges(
    start = c(49, 221, 336, 749),
    end = c(219, 334, 630, 1022),
    names = c(
      "Sugar Binding",
      "β-Galactosidase",
      "TIM Barrel",
      "β-Galactosidase\nSmall Chain"
    )
  )
)

# Residue-number axis.
paxTrack <- ProteinAxisTrack(addNC = FALSE, littleTicks = TRUE)

# Domain boxes in residue coordinates:
#   Sugar Binding               49-219   (PF02837)
#   β-Galactosidase             221-334  (PF00703)
#   TIM Barrel                  336-630  (PF02836)
#   β-Galactosidase Small Chain 749-1022 (PF02929)
domains <- AnnotationTrack(
  name = "β-Gal",
  genome = "lacZ",
  chromosome = "lacZ",
  start = c(49, 221, 336, 749),
  end = c(219, 334, 630, 1022),
  id = c(
    "Sugar Binding",
    "β-Galactosidase",
    "TIM Barrel",
    "β-Galactosidase Small Chain"
  ),
  fill = domains_feature_fill
)

# Shared y-axis range for the codon-level tracks: from 0 to the largest number
# of mutations (of any kind) recorded for a single non-missing Codon.
overall_ylim_codon <- c(
  0,
  snvs_ins_del_clean %>%
    dplyr::count(Codon) %>%
    tidyr::drop_na() %>%
    dplyr::pull(n) %>%
    max()
)
# overall_ylim = NULL

# Missense mutations per codon, both technologies pooled.
# NOTE(review): the track object is called all_codon_mutations, but it only
# contains rows whose Consequence is "missense".
dat_aa <- snvs_ins_del_clean %>%
  dplyr::filter(Consequence == "missense") %>%
  dplyr::count(Codon) %>%
  tidyr::drop_na()
all_codon_mutations <- DataTrack(
  name = "Missense Mutations",
  genome = "lacZ",
  chromosome = "lacZ",
  start = dat_aa$Codon,
  end = dat_aa$Codon,
  data = dat_aa$n,
  type = "h",
  ylim = overall_ylim_codon
)

# Missense mutations per codon, Sanger rows only.
dat_aa_sanger <- snvs_ins_del_clean %>%
  dplyr::filter(Consequence == "missense", Technology == "Sanger") %>%
  dplyr::count(Codon) %>%
  tidyr::drop_na()
codon_mutations_sanger <- DataTrack(
  name = "Sanger",
  genome = "lacZ",
  chromosome = "lacZ",
  start = dat_aa_sanger$Codon,
  end = dat_aa_sanger$Codon,
  data = dat_aa_sanger$n,
  type = "h",
  ylim = overall_ylim_codon
)

# Missense mutations per codon, NGS rows only.
dat_aa_ngs <- snvs_ins_del_clean %>%
  dplyr::filter(Consequence == "missense", Technology == "NGS") %>%
  dplyr::count(Codon) %>%
  tidyr::drop_na()
codon_mutations_ngs <- DataTrack(
  name = "NGS",
  genome = "lacZ",
  chromosome = "lacZ",
  start = dat_aa_ngs$Codon,
  end = dat_aa_ngs$Codon,
  data = dat_aa_ngs$n,
  type = "h",
  ylim = overall_ylim_codon
)

# Close-up views of the first 100 and the first 500 residues.
plotTracks(
  list(lacZ_vis_aa, domains, all_codon_mutations),
  from = 1,
  to = 100
)

plotTracks(
  list(lacZ_vis_aa, domains, all_codon_mutations),
  from = 1,
  to = 500
)

# Pooled / NGS / Sanger missense tracks with the domain bands behind them.
domain_ht <- HighlightTrack(
  trackList = list(
    all_codon_mutations,
    codon_mutations_ngs,
    codon_mutations_sanger
  ),
  chromosome = "lacZ",
  start = c(49, 221, 336, 749),
  end = c(219, 334, 630, 1022),
  fill = c(
    "#FFE5E5", # rgb(255, 229, 229)
    "#F4FAED", # rgb(244, 250, 237)
    "#F0EAF5", # rgb(240, 234, 245)
    "#FCF0E6" # rgb(252, 240, 230)
  ),
  col = "#000000FF", # opaque black outline
  lwd = 0.2,
  inBackground = TRUE
)

# Full-length figure (lacZ_vis_aa is left out).
plotTracks(
  list(domains, domain_ht, paxTrack),
  stackHeight = 1,
  featureAnnotation = "id",
  fontsize.feature = 7,
  fontcolor.feature = "white",
  background.panel = "transparent",
  background.title = "slategrey" # brown?
)

#           featureAnnotation = "id",
#           fontcolor.feature = "darkblue",
#           background.title = "brown",
#           background.panel = "transparent",
#           fontsize.feature = 7)

Amino acid by mutation type figure

# Per-codon mutation counts by consequence / type. Rows with a missing Codon
# are dropped; all tracks share overall_ylim_codon and axis_scale.

# Consequence == "nonsense"
dat_nonsense_aa <- snvs_ins_del_clean %>%
  dplyr::filter(Consequence == "nonsense") %>%
  dplyr::count(Codon) %>%
  tidyr::drop_na()
nonsense_aa <- DataTrack(
  name = "Nonsense",
  genome = "lacZ",
  chromosome = "lacZ",
  start = dat_nonsense_aa$Codon,
  end = dat_nonsense_aa$Codon,
  data = dat_nonsense_aa$n,
  type = "h",
  ylim = overall_ylim_codon,
  cex.axis = axis_scale
)

# Consequence == "missense"
dat_missense_aa <- snvs_ins_del_clean %>%
  dplyr::filter(Consequence == "missense") %>%
  dplyr::count(Codon) %>%
  tidyr::drop_na()
missense_aa <- DataTrack(
  name = "Missense",
  genome = "lacZ",
  chromosome = "lacZ",
  start = dat_missense_aa$Codon,
  end = dat_missense_aa$Codon,
  data = dat_missense_aa$n,
  type = "h",
  ylim = overall_ylim_codon,
  cex.axis = axis_scale
)

# Consequence == "frameshift"
dat_frameshift_aa <- snvs_ins_del_clean %>%
  dplyr::filter(Consequence == "frameshift") %>%
  dplyr::count(Codon) %>%
  tidyr::drop_na()
frameshift_aa <- DataTrack(
  name = "Frameshift",
  genome = "lacZ",
  chromosome = "lacZ",
  start = dat_frameshift_aa$Codon,
  end = dat_frameshift_aa$Codon,
  data = dat_frameshift_aa$n,
  type = "h",
  ylim = overall_ylim_codon,
  cex.axis = axis_scale
)

# Type == "Insertion"
dat_insertions_aa <- snvs_ins_del_clean %>%
  dplyr::filter(Type == "Insertion") %>%
  dplyr::count(Codon) %>%
  tidyr::drop_na()
insertions_aa <- DataTrack(
  name = "Insertions",
  genome = "lacZ",
  chromosome = "lacZ",
  start = dat_insertions_aa$Codon,
  end = dat_insertions_aa$Codon,
  data = dat_insertions_aa$n,
  type = "h",
  ylim = overall_ylim_codon,
  cex.axis = axis_scale
)

# Type == "Deletion"
dat_deletions_aa <- snvs_ins_del_clean %>%
  dplyr::filter(Type == "Deletion") %>%
  dplyr::count(Codon) %>%
  tidyr::drop_na()
deletions_aa <- DataTrack(
  name = "Deletions",
  genome = "lacZ",
  chromosome = "lacZ",
  start = dat_deletions_aa$Codon,
  end = dat_deletions_aa$Codon,
  data = dat_deletions_aa$n,
  type = "h",
  ylim = overall_ylim_codon,
  cex.axis = axis_scale
)

# Only three tracks are plotted; frameshift_aa, insertions_aa and deletions_aa
# are built above but deliberately left out of the figure.
# NOTE(review): all_codon_mutations is itself a missense-only track, so the
# first two tracks may show the same counts — confirm.
domain_ht_aa_type <- HighlightTrack(
  trackList = list(
    all_codon_mutations,
    missense_aa,
    nonsense_aa
  ),
  chromosome = "lacZ",
  start = c(49, 221, 336, 749),
  end = c(219, 334, 630, 1022),
  fill = c(
    "#FFE5E5", # rgb(255, 229, 229)
    "#F4FAED", # rgb(244, 250, 237)
    "#F0EAF5", # rgb(240, 234, 245)
    "#FCF0E6" # rgb(252, 240, 230)
  ),
  col = "#000000FF", # opaque black outline
  lwd = 0.2,
  inBackground = TRUE
)

plotTracks(
  list(domains, domain_ht_aa_type, paxTrack),
  stackHeight = 1,
  featureAnnotation = "id",
  fontsize.feature = 7,
  fontcolor.feature = "white",
  background.panel = "transparent",
  background.title = "slategrey" # brown?
)

# Amino acids, spontaneous mutations only

# Mutations per codon among rows whose Exposure is "Control"
# (missing codons dropped).
dat_spontaneous <- snvs_ins_del_clean %>%
  dplyr::filter(Exposure == "Control") %>%
  dplyr::count(Codon) %>%
  tidyr::drop_na()
# 378 codons from spontaneous (controls)

# This track gets its own y range, scaled to the control data alone.
spontaneous_aa <- DataTrack(
  name = "Spontaneous Mutations",
  genome = "lacZ",
  chromosome = "lacZ",
  start = dat_spontaneous$Codon,
  end = dat_spontaneous$Codon,
  data = dat_spontaneous$n,
  type = "h",
  ylim = c(0, max(dat_spontaneous$n)),
  cex.axis = axis_scale
)

domain_ht_aa_spontaneous <- HighlightTrack(
  trackList = list(spontaneous_aa),
  chromosome = "lacZ",
  start = c(49, 221, 336, 749),
  end = c(219, 334, 630, 1022),
  fill = c(
    "#FFE5E5", # rgb(255, 229, 229)
    "#F4FAED", # rgb(244, 250, 237)
    "#F0EAF5", # rgb(240, 234, 245)
    "#FCF0E6" # rgb(252, 240, 230)
  ),
  col = "#000000FF", # opaque black outline
  lwd = 0.2,
  inBackground = TRUE
)

plotTracks(
  list(domains, domain_ht_aa_spontaneous, paxTrack),
  stackHeight = 1,
  featureAnnotation = "id",
  fontsize.feature = 7,
  fontcolor.feature = "white",
  background.panel = "transparent",
  background.title = "slategrey" # brown?
)

Amino acids, mutagen-induced mutations only

# Mutations per codon among rows whose Exposure is known and is not "Control"
# (missing codons dropped).
dat_induced <- snvs_ins_del_clean %>%
  dplyr::filter(Exposure != "Control") %>%
  dplyr::count(Codon) %>%
  tidyr::drop_na()
# NOTE(review): the remark "378 codons from spontaneous (controls)" used to sit
# here; it refers to the control subset, not to dat_induced. Use
# nrow(dat_induced) if the induced codon count is needed.

# This track gets its own y range, scaled to the induced data alone.
induced_aa <- DataTrack(
  name = "Induced Mutations",
  genome = "lacZ",
  chromosome = "lacZ",
  start = dat_induced$Codon,
  end = dat_induced$Codon,
  data = dat_induced$n,
  type = "h",
  ylim = c(0, max(dat_induced$n)),
  cex.axis = axis_scale
)

domain_ht_aa_induced <- HighlightTrack(
  trackList = list(induced_aa),
  chromosome = "lacZ",
  start = c(49, 221, 336, 749),
  end = c(219, 334, 630, 1022),
  fill = c(
    "#FFE5E5", # rgb(255, 229, 229)
    "#F4FAED", # rgb(244, 250, 237)
    "#F0EAF5", # rgb(240, 234, 245)
    "#FCF0E6" # rgb(252, 240, 230)
  ),
  col = "#000000FF", # opaque black outline
  lwd = 0.2,
  inBackground = TRUE
)

plotTracks(
  list(domains, domain_ht_aa_induced, paxTrack),
  stackHeight = 1,
  featureAnnotation = "id",
  fontsize.feature = 7,
  fontcolor.feature = "white",
  background.panel = "transparent",
  background.title = "slategrey" # brown?
)

Silent mutations, Fig S3

# Silent mutations counted per PositionRef, i.e. at nucleotide level rather
# than per codon; rows without a PositionRef are dropped.
dat_silent <- snvs_ins_del_clean %>%
  dplyr::filter(Consequence == "silent") %>%
  dplyr::count(PositionRef) %>%
  tidyr::drop_na()
# NOTE(review): the remark "378 codons from spontaneous (controls)" used to sit
# here; it does not describe dat_silent.

# Despite the _aa suffix this is a nucleotide-coordinate track; it has its own
# y range, scaled to the silent data alone.
silent_aa <- DataTrack(
  name = "Silent Mutations",
  genome = "lacZ",
  chromosome = "lacZ",
  start = dat_silent$PositionRef,
  end = dat_silent$PositionRef,
  data = dat_silent$n,
  type = "h",
  ylim = c(0, max(dat_silent$n)),
  cex.axis = axis_scale
)

# Domain bands in nucleotide coordinates (residue boundary * 3).
# NOTE(review): k * 3 is the last base of codon k, so starts may be 2 nt late.
domain_ht_aa_silent <- HighlightTrack(
  trackList = list(silent_aa),
  chromosome = "lacZ",
  start = c(49, 221, 336, 749) * 3,
  end = c(219, 334, 630, 1022) * 3,
  fill = c(
    "#FFE5E5", # rgb(255, 229, 229)
    "#F4FAED", # rgb(244, 250, 237)
    "#F0EAF5", # rgb(240, 234, 245)
    "#FCF0E6" # rgb(252, 240, 230)
  ),
  col = "#000000FF", # opaque black outline
  lwd = 0.2,
  inBackground = TRUE
)

plotTracks(
  list(domains_nt, domain_ht_aa_silent, gtrack),
  stackHeight = 1,
  featureAnnotation = "id",
  fontsize.feature = 7,
  fontcolor.feature = "white",
  background.panel = "transparent",
  background.title = "slategrey" # brown?
)

Exploration of the functional changes, position of mutation in codon, and whether there are any potential over- or under-represented mutation types

# For every Type / Ref / Alt combination, compute the percentage of its rows
# that change the amino acid (FunctionalChange is 0 or 1), then count how many
# times each individual mutation (Position + Type + Ref + Alt) was observed.
# The result stays grouped by Position, Type, Ref and Alt.
occurrences <- snvs_ins_del_clean %>%
  dplyr::group_by(Type, Ref, Alt) %>%
  dplyr::mutate(
    FunctionalChangePercent = 100 * (sum(FunctionalChange) / dplyr::n())
  ) %>%
  dplyr::group_by(Position, Type, Ref, Alt, FunctionalChangePercent) %>%
  dplyr::summarise(n = dplyr::n(), .groups = "drop_last")


# Distribution of how often each distinct SNV was observed, one panel per
# reference base. x is a factor, so the bars are plain counts; geom_bar() is
# the direct way to draw that and avoids the "ignoring unknown parameters"
# warning that geom_histogram(stat = "count") raises.
ggplot(occurrences %>% filter(Type == "SNV"), aes(x = factor(n))) +
  geom_bar() +
  facet_wrap(~Ref) +
  theme(axis.text.x = element_text(size = 5)) +
  ggtitle("Number of times mutation observed, by reference base")

# Same distribution, split by reference base (columns) and alternate base
# (rows), coloured by reference base.
ggplot(occurrences %>% filter(Type == "SNV"), aes(x = factor(n), fill = Ref)) +
  geom_bar() +
  facet_grid(Alt ~ Ref) +
  theme(axis.text.x = element_text(size = 5)) +
  ggtitle("Number of times mutation observed, by reference and alternate base")

# Share of SNVs that change the amino acid, by reference base, alternate base
# and position within the codon. FunctionalChange is recomputed as a logical
# (TRUE when the reference and alternate amino acids differ).
# ratio is the formatted share of each TRUE/FALSE count within its
# Ref / Alt / codon_position group.
occurrences_ratio <- snvs_ins_del_clean %>%
  dplyr::filter(Type == "SNV") %>%
  dplyr::mutate(FunctionalChange = `Ref A.A.` != `Alt A.A.`) %>%
  dplyr::group_by(Ref, Alt, codon_position, FunctionalChange) %>%
  dplyr::summarise(n = dplyr::n(), .groups = "drop_last") %>%
  dplyr::mutate(ratio = scales::percent(n / sum(n)))

ggplot(occurrences_ratio, aes(x = Ref, y = n, fill = FunctionalChange)) +
  geom_col(position = "fill") +
  geom_text(aes(y = n, label = ratio), position = position_fill(vjust = 0.5)) +
  facet_grid(
    rows = vars(Alt),
    cols = vars(codon_position),
    scales = "free_x"
  ) +
  labs(
    title = "Classification of functional change, by reference and alternate base"
  )

# Same summary without the alternate base: groups are Ref / codon_position.
occurrences_ratio_target_only <- snvs_ins_del_clean %>%
  dplyr::filter(Type == "SNV") %>%
  dplyr::mutate(FunctionalChange = `Ref A.A.` != `Alt A.A.`) %>%
  dplyr::group_by(Ref, codon_position, FunctionalChange) %>%
  dplyr::summarise(n = dplyr::n(), .groups = "drop_last") %>%
  dplyr::mutate(ratio = scales::percent(n / sum(n)))

ggplot(
  occurrences_ratio_target_only,
  aes(x = Ref, y = n, fill = FunctionalChange)
) +
  geom_col(position = "fill") +
  geom_text(aes(y = n, label = ratio), position = position_fill(vjust = 0.5)) +
  facet_grid(cols = vars(codon_position), scales = "free_x") +
  labs(
    title = "Classification of functional change, by reference base, split by position in codon"
  )

# Same again, additionally split by exposure group (one facet row each).
occurrences_ratio_target_only_chem <- snvs_ins_del_clean %>%
  dplyr::filter(Type == "SNV") %>%
  dplyr::mutate(FunctionalChange = `Ref A.A.` != `Alt A.A.`) %>%
  dplyr::group_by(Ref, codon_position, Exposure, FunctionalChange) %>%
  dplyr::summarise(n = dplyr::n(), .groups = "drop_last") %>%
  dplyr::mutate(ratio = scales::percent(n / sum(n)))

ggplot(
  occurrences_ratio_target_only_chem,
  aes(x = Ref, y = n, fill = FunctionalChange)
) +
  geom_col(position = "fill") +
  geom_text(aes(y = n, label = ratio), position = position_fill(vjust = 0.5)) +
  facet_grid(
    rows = vars(Exposure),
    cols = vars(codon_position),
    scales = "free_x"
  ) +
  labs(
    title = "Classification of functional change, by reference base, split by position in codon"
  )

# Render the Ref / Alt / codon-position summary as a table: kable first, then
# a 480px-high scroll box, then the "paper" theme.
kableExtra::kable_paper(
  kableExtra::scroll_box(
    knitr::kable(occurrences_ratio),
    height = "480px"
  )
)
Ref Alt codon_position FunctionalChange n ratio
A C 1 TRUE 17 100%
A C 2 TRUE 32 100%
A C 3 FALSE 11 73%
A C 3 TRUE 4 27%
A G 1 TRUE 28 100%
A G 2 TRUE 79 100%
A G 3 FALSE 7 100%
A T 1 TRUE 70 100%
A T 2 TRUE 79 100%
A T 3 FALSE 3 18%
A T 3 TRUE 14 82%
C A 1 FALSE 1 1%
C A 1 TRUE 113 99%
C A 2 TRUE 257 100%
C A 3 FALSE 40 9%
C A 3 TRUE 401 91%
C G 1 TRUE 51 100%
C G 2 TRUE 71 100%
C G 3 FALSE 10 8%
C G 3 TRUE 114 92%
C T 1 FALSE 5 1%
C T 1 TRUE 668 99%
C T 2 TRUE 180 100%
C T 3 FALSE 17 100%
G A 1 TRUE 458 100%
G A 2 TRUE 378 100%
G A 3 FALSE 53 18%
G A 3 TRUE 236 82%
G C 1 TRUE 125 100%
G C 2 TRUE 121 100%
G C 3 FALSE 17 42%
G C 3 TRUE 23 57%
G T 1 TRUE 622 100%
G T 2 TRUE 233 100%
G T 3 FALSE 45 47.9%
G T 3 TRUE 49 52.1%
T A 1 TRUE 39 100%
T A 2 TRUE 114 100%
T A 3 FALSE 6 5%
T A 3 TRUE 123 95%
T C 1 TRUE 96 100%
T C 2 FALSE 1 3%
T C 2 TRUE 36 97%
T C 3 FALSE 7 100%
T G 1 TRUE 15 100%
T G 2 TRUE 34 100%
T G 3 FALSE 4 9%
T G 3 TRUE 40 91%

How many CpG sites are in lacZ? What is the overall GC content? How many of the CpG sites are accounted for in our data?

# Locate every "CG" dinucleotide in the first sequence of lacz_mutamouse;
# the outer parentheses also print the matches.
(CpGs <- Biostrings::matchPattern(
  pattern = "CG",
  subject = lacz_mutamouse[[1]]
))
## Views on a 3096-letter DNAString subject
## subject: ATGACCATGATTACGGATTCACTGGAATTCCCGG...TTACCAGTTGGTCTGGTGTCAAAAATAATAATAA
## views:
##         start  end width
##     [1]    14   15     2 [CG]
##     [2]    32   33     2 [CG]
##     [3]    42   43     2 [CG]
##     [4]    45   46     2 [CG]
##     [5]    55   56     2 [CG]
##     ...   ...  ...   ... ...
##   [287]  3029 3030     2 [CG]
##   [288]  3032 3033     2 [CG]
##   [289]  3048 3049     2 [CG]
##   [290]  3051 3052     2 [CG]
##   [291]  3055 3056     2 [CG]
length(CpGs)
## [1] 291
# 291 CpG sites in total in reference sequence

# The CG matches as genomic ranges on the "lacZ" pseudo-chromosome
# (each range is 2 nt wide: the C and the G).
CpG_sites_ranges <- GRanges(
  seqnames = "lacZ",
  ranges = IRanges(start = start(ranges(CpGs)), end = end(ranges(CpGs)))
)

# Mutations whose position overlaps one of those ranges, as a data frame,
# restricted to SNVs.
CpGs_in_data <- lacZ_granges %>%
  plyranges::find_overlaps(CpG_sites_ranges) %>%
  as.data.frame() %>%
  dplyr::filter(Type == "SNV")

# Fraction (0-1, not a percentage) of upper-case "G" and "C" characters in a
# string.
#
# seq: character vector of sequences; the calculation is applied to each
#   element separately.
# Returns a numeric vector: (count of "G" + count of "C") / string length.
calculate_gc <- function(seq) {
  gc_bases <- stringr::str_count(seq, "G") + stringr::str_count(seq, "C")
  gc_bases / stringr::str_length(seq)
}

# Overall GC fraction of the first lacz_mutamouse sequence.
lacz_mutamouse[[1]] %>%
  unlist() %>%
  as.character() %>%
  calculate_gc()
## [1] 0.5613695
# How many of the CpG sites are accounted for?
# This counts the distinct start positions of SNVs that overlap a CG match.
# NOTE(review): each CG match covers two positions, so this is a count of
# mutated positions inside CpGs, not necessarily of CpG dinucleotides — confirm
# which of the two is meant.
dplyr::n_distinct(CpGs_in_data$start)
## [1] 276
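# 276 distinct SNV positions fall within CpG dinucleotides (each CpG spans two positions, so this is not directly "276 of the 291 CpG sites")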

Known amino acid changes based on literature review

# Amino acid changes already described in the literature, with nucleotide
# coordinates in the columns nuc_start / nuc_end.
existing <- read.table("./data/raw/21_existing_amino_acid_changes.txt",
  header = TRUE,
  sep = "\t"
)

# Shift coordinates to account for the MutaMouse insertion, using the same
# rule that is applied to the mutation positions earlier in this file:
#   positions below 26 -> subtract 3
#   positions above 40 -> subtract 18
#   positions 26-40 (and missing positions) -> NA, no lifted coordinate
# NOTE(review): presumably the result is numbered relative to the lacZ
# reference sequence (hence the ref_ prefix) — confirm.
#
# pos: numeric vector of positions.
# Returns a numeric vector of the same length.
lift_nuc_position <- function(pos) {
  lifted <- rep(NA_real_, length(pos))
  before_insertion <- !is.na(pos) & pos < 26
  after_insertion <- !is.na(pos) & pos > 40
  lifted[before_insertion] <- pos[before_insertion] - 3
  lifted[after_insertion] <- pos[after_insertion] - 18
  lifted
}

# ref_nuc_start must be created directly before ref_nuc_end: unite() below
# selects the column range ref_nuc_start:ref_nuc_end.
existing$ref_nuc_start <- lift_nuc_position(existing$nuc_start)
existing$ref_nuc_end <- lift_nuc_position(existing$nuc_end)

# Add a "start - end" label while keeping the two numeric columns.
existing <- existing %>%
  tidyr::unite(
    "ref_nuc_range",
    ref_nuc_start:ref_nuc_end,
    sep = " - ",
    remove = FALSE
  )

write.table(existing,
  file = "data/processed/21_existing_amino_acid_changes_coordinates_lifted.txt",
  quote = FALSE,
  sep = "\t",
  row.names = FALSE
)

Various numbers on composition of data used in Beal et al 2021

# Number of "independent mutations", i.e. the number of rows in the data set.
# Duplicates across studies/chemicals/samples are included on purpose: this
# total is the basis for the hotspot analysis.
snvs_ins_del_clean %>%
  dplyr::tally() # There are 6,465
## # A tibble: 1 × 1
##       n
##   <int>
## 1  6465
# Missense mutations impairing B-gal
snvs_ins_del_clean %>%
  dplyr::filter(Consequence == "missense") %>%
  dplyr::tally() # There are 2,732
## # A tibble: 1 × 1
##       n
##   <int>
## 1  2732
# Nonsense mutations
snvs_ins_del_clean %>%
  dplyr::filter(Consequence == "nonsense") %>%
  dplyr::tally() # There are 2,206
## # A tibble: 1 × 1
##       n
##   <int>
## 1  2206
# Silent mutations
snvs_ins_del_clean %>%
  dplyr::filter(Consequence == "silent") %>%
  dplyr::tally() # There are 227
## # A tibble: 1 × 1
##       n
##   <int>
## 1   227
# SNVs
snvs_ins_del_clean %>%
  dplyr::filter(Type == "SNV") %>%
  dplyr::tally() # There are 5,147
## # A tibble: 1 × 1
##       n
##   <int>
## 1  5147
# Missense - this specifically means SNVs.
# Put another way: how many times did we disrupt the coding sequence of the
# protein? One row per codon, with the number of missense rows at that codon.
Fig1 <- snvs_ins_del_clean %>%
  dplyr::filter(Consequence == "missense") %>% # 2,732 rows
  dplyr::count(Codon)

# Supplementary materials - how many unique SNVs?
# One row per distinct Position / Ref / Alt; the result stays grouped by them.
snvs_ins_del_clean %>%
  dplyr::filter(Type == "SNV") %>%
  dplyr::group_by(Position, Ref, Alt) %>%
  dplyr::summarise(n = dplyr::n(), .groups = "keep")
## # A tibble: 1,399 × 4
## # Groups:   Position, Ref, Alt [1,399]
##    Position Ref   Alt       n
##       <dbl> <chr> <chr> <int>
##  1        4 A     C         1
##  2        6 C     A         1
##  3       19 T     C         3
##  4       20 C     A         2
##  5       20 C     G         2
##  6       21 A     C         3
##  7       25 G     T         6
##  8       26 A     C         1
##  9       27 A     C         1
## 10       27 A     T         1
## # ℹ 1,389 more rows
# There are 1,399

# Of the 1,399 unique SNVs, how many mutations (rows) does that refer to?
# Summing the per-combination counts should give back the number of SNV rows.
snvs_ins_del_clean %>%
  dplyr::filter(Type == "SNV") %>%
  dplyr::count(Position, Ref, Alt) %>%
  dplyr::pull(n) %>%
  sum()
## [1] 5147
# There are 5,147

# How many unique SNV missense mutations (Position/Ref/Alt combinations)?
# There are 895
snvs_ins_del_clean %>%
  dplyr::filter(Type == "SNV", Consequence == "missense") %>%
  dplyr::group_by(Position, Ref, Alt) %>%
  dplyr::summarise(n = dplyr::n(), .groups = "keep")
## # A tibble: 895 × 4
## # Groups:   Position, Ref, Alt [895]
##    Position Ref   Alt       n
##       <dbl> <chr> <chr> <int>
##  1        4 A     C         1
##  2       19 T     C         3
##  3       26 A     C         1
##  4       27 A     C         1
##  5       27 A     T         1
##  6       29 T     G         1
##  7       30 C     G         1
##  8       41 C     A         1
##  9       51 A     C         1
## 10       52 C     A         2
## # ℹ 885 more rows
# How many residues are impacted by a functional change in all the data?
# Counts the distinct codons among rows with FunctionalChange == 1.
# There are 633
snvs_ins_del_clean %>%
  dplyr::filter(FunctionalChange == 1) %>%
  dplyr::distinct(Codon) %>%
  dplyr::tally()
## # A tibble: 1 × 1
##       n
##   <int>
## 1   633
# How many codons identified by Sanger?
# Missense rows (2,732 across all data) restricted to Sanger, then grouped by
# Codon so that tally() reports one row per codon.
sanger <- snvs_ins_del_clean %>%
  dplyr::filter(Consequence == "missense", Technology == "Sanger") %>%
  dplyr::group_by(Codon)
dplyr::tally(sanger)
## # A tibble: 266 × 2
##    Codon     n
##    <dbl> <int>
##  1    15     1
##  2    19     1
##  3    25     1
##  4    30     1
##  5    47     1
##  6    54     1
##  7    65     1
##  8    67     1
##  9    73     2
## 10    81     1
## # ℹ 256 more rows
# How many codons identified by NGS?
# Missense rows (2,732 across all data) restricted to NGS, then grouped by
# Codon so that tally() reports one row per codon.
ngs <- snvs_ins_del_clean %>%
  dplyr::filter(Consequence == "missense", Technology == "NGS") %>%
  dplyr::group_by(Codon)
dplyr::tally(ngs)
## # A tibble: 384 × 2
##    Codon     n
##    <dbl> <int>
##  1     2     1
##  2     7     3
##  3     8     6
##  4    11     1
##  5    12     2
##  6    16     1
##  7    24     1
##  8    31     2
##  9    32     1
## 10    34     1
## # ℹ 374 more rows
# Number of codons with at least one missense mutation
nrow(Fig1)
## [1] 492
# The same count relative to the length of laczref_aa[[1]]
nrow(Fig1) / length(laczref_aa[[1]])
## [1] 0.4795322
# Bar chart of the missense mutation count at each codon
ggplot(data = Fig1, mapping = aes(x = Codon, y = n)) +
  geom_col() +
  theme_bw()

# How many codons in spontaneous mutants?
# Missense rows from control samples, kept grouped by Codon so that
# n_groups()/group_size() below report per-codon figures.
controls <- snvs_ins_del_clean %>%
  dplyr::filter(Consequence == "missense", Exposure == "Control") %>%
  dplyr::group_by(Codon)
nrow(controls) # missense mutations (rows) found in controls
## [1] 859
n_groups(controls) # 204 codons carry missense mutations in controls
## [1] 204
group_size(controls) # missense mutations per codon
##   [1]  1  1  1  1  1  1  1  4  1  1  2  1  1  2  1  1  2  2  1  2  1  9  1  2  1
##  [26]  1  3  1  1  1  1  1  1  1  7 26  1 23  1  1  1  1  2  1  1  1  1  1  2 19
##  [51]  2  2  2  1  1  1  7  1  2  1  1 54  1  1  1  1  2  1  1  1  4  1 67  3 23
##  [76]  1  1  1  2  1  9  1  5  2  2  3  2  1  1  2  9  1  1  1  1  1  1  8 10  1
## [101]  5  1  1  1  5  2  1  1  1  1  1  1  5  3  5  2  7  1  1  1  1  2  1  2 59
## [126]  5  2  1  3  3  6  4  6  1  1 10  4 10  1  1  1  5  1  2 31  3  1  1  1  1
## [151]  1  1  1  1  1  1  2  2  1  1  1  3 54  2 16  1  1  1  1  1  1  1  1  1 28
## [176]  1  1  2  1 25  1  4  1  7 47  1  1  1  5  4  1  5  2  3  1  1  2  1  1  1
## [201]  2  2  1  1
# How many codons in mutagen exposed samples?
# Missense rows from every non-control exposure, kept grouped by Codon so
# that n_groups()/group_size() below report per-codon figures.
non_controls <- snvs_ins_del_clean %>%
  dplyr::filter(Consequence == "missense", Exposure != "Control") %>%
  dplyr::group_by(Codon)
nrow(non_controls) # missense mutations (rows) found in non-controls
## [1] 1873
n_groups(non_controls) # 439 codons carry missense mutations in non-controls
## [1] 439
group_size(non_controls) # missense mutations per codon
##   [1]  1  2  5  1  2  1  1  1  1  2  1  1  1  3  1  1  2  2  2  1  1  1  3  1  2
##  [26]  2  4  3  1  1  2  1  1  1  1  3  2  2  1  2  1  1  1  1  4  1  2  1  1  1
##  [51]  3  4  5  1  1  1  1  2  1  1  2  5  8  1  8  9  1  2  3  1  3  1  3  1  2
##  [76]  4  2  2  2  2  1  2  5  2  3 29 11  7  1  6 33  1 13  2  4  1  2  2  1  1
## [101]  1  1  1  1  1  1  1  1  2  1  3  3  1  3  1  1  1  1  2  2  4  2  1  1  1
## [126]  2  1  1  1 19  2  1 13  1  1  1  1  1  1  1  4  1 13 16  2  1  1  2  2 16
## [151]  1  6  9  4 18  1  1  1  1  1  1  1  8  3  2  1  1  1  8  2  8  2 33 20 16
## [176]  1  1  1  1  9  3  6 20  2  8  7  1 15  1  8  3  1  1  4  1  1  2  4  1  3
## [201] 24  1  1  1  5  5  1  1  8 15 17 12  6  8  2  1  1  1  1  4  1  1  5  2  2
## [226]  3  1  1  1  2  2  1  1 18 17 21  5  9  2  1  1  3  1  1  1  1  7  7  2  1
## [251] 30  1  1 62 11  2  4 14 13  4 24  1  2  2  1  1  1  1  1  2 17 20  2 44  3
## [276]  8  3  1  1  4  3  2  1  2  3  6  3  2 28  3  1  1  2  3  1  2  1  1  1  1
## [301]  1  3  2  1  2  1  1  1  2  1  2  1  1  1  2  1  1  1  1  5  2  1  1  1  2
## [326]  2  2  1  1  1  1  1  1  1  2  1  1  1  2  4 88  2  2  1 18 33  2  1  2  4
## [351]  2  1  3  1  1  1  1  2  2  2  3  1  1  1  1  1 13 25  1  6  2  1  1  1  1
## [376]  3  4  3 42 15  1  1 10  2  2 16 44  2  1  1  1  1  2  1 10 13  3  7 11  2
## [401] 16  1  1 20  2  1  2  1  1  7 12  5  2  1  1  1  2  1  1  1  1  1  1  3  1
## [426]  3  7  5  1  2  2  1  1  2  1  1  1  1  1
# Codons common to controls and experimental samples
# There are 151 overlapping
aa_inboth_ctrl_exp <- dplyr::intersect(
  controls %>% dplyr::pull(Codon),
  non_controls %>% dplyr::pull(Codon)
)

# Left join on Codon: returns every codon seen in controls, in order of first
# appearance. Each side is reduced to its distinct codons before joining so
# the join is one-to-one rather than many-to-many, and the key is given
# explicitly instead of being guessed from shared column names.
dplyr::left_join(
  controls %>% dplyr::ungroup() %>% dplyr::distinct(Codon),
  non_controls %>% dplyr::ungroup() %>% dplyr::distinct(Codon),
  by = "Codon"
) %>%
  dplyr::pull(Codon)
##   [1]    7   55   62   77   91  124  146  148  154  194  201  207  272  301  304
##  [16]  323  331  353  354  355  357  358  375  379  387  388  389  390  391  393
##  [31]  403  406  412  414  418  419  439  440  457  459  460  461  462  463  488
##  [46]  500  502  503  505  532  535  537  540  541  542  543  544  545  546  547
##  [61]  564  565  568  570  589  602  605  640  652  670  673  691  786  791  792
##  [76]  806  832  881  890  897  898  899  901  908  909  914  932  933  935  938
##  [91]  942  950  951  993  994  134  135  203  449  651  770  921  324  430  487
## [106]  501  528  611  775  810  952    8  181  313  386  592  763  404  112  166
## [121]  504  561   73  569  485  416  782  210  497  573  863  105  190  360  489
## [136]  169  436  780  934  800   19  145  446  200  206  254  268  295  359  362
## [151]  453  622  872  245  624  773  796  883  941  997  448  149  193  294  296
## [166]  299  352  366  395  405  452  474  492  498  509  515  557  712  733  797
## [181]  398  407  925  946  116  533  904  422  198  356  900   47   54  100  139
## [196]  164  328  986  165  281  852 1022   25  524
# Right join on Codon: returns every codon seen in non-controls; codons that
# are also present in controls come first, followed by the codons found only
# in non-controls. Each side is reduced to its distinct codons before joining
# so the join is one-to-one rather than many-to-many, and the key is explicit.
dplyr::right_join(
  controls %>% dplyr::ungroup() %>% dplyr::distinct(Codon),
  non_controls %>% dplyr::ungroup() %>% dplyr::distinct(Codon),
  by = "Codon"
) %>%
  dplyr::pull(Codon)
##   [1]    7   62   91  124  146  148  194  201  207  272  301  304  331  353  354
##  [16]  355  357  358  375  387  388  389  390  391  393  403  406  412  414  418
##  [31]  419  439  457  459  460  461  462  463  488  500  502  503  505  532  537
##  [46]  540  541  542  543  544  545  546  547  564  565  568  570  589  602  605
##  [61]  640  652  670  691  786  791  792  806  881  890  897  898  899  901  908
##  [76]  909  932  933  935  938  942  951  993  994  134  135  203  449  324  430
##  [91]  487  501  528  611  810  952    8  313  386  592  763  404  504  561   73
## [106]  569  485  416  210  497  573  489  436  780  934  145  446  200  206  254
## [121]  295  359  453  622  883  941  997  149  193  296  299  452  498  509  557
## [136]  712  398  533  904  422  198  356   47   54  100  139  328  986  165  852
## [151]  524   11   31   32   35   38   40   44   46   49   50   52   53   65   67
## [166]   70   71   82   84   87   96   97   98  103  119  120  121  137  138  152
## [181]  155  160  162  168  173  184  185  186  199  204  205  208  216  238  251
## [196]  255  262  269  270  298  302  303  310  319  333  336  345  346  372  380
## [211]  381  396  400  402  441  451  464  469  482  490  495  507  512  513  538
## [226]  553  590  599  610  616  623  628  703  705  707  716  722  725  736  750
## [241]  757  777  779  781  785  787  788  805  812  841  879  880  886  889  892
## [256]  902  905  907  912  928  930  936  939  947  949  953  954  961  970  976
## [271]  981  987  995  999 1001 1005 1017 1020    2   12   34   94  104  211  267
## [286]  287  288  361  499  687  688  692  809  815  823  854  966  967  998   16
## [301]   74   88  212  217  244  256  258  259  365  567  603  604  606  778  817
## [316]  840  884  895  991 1000  224  326  749  774  882  102  226  279  363  522
## [331]  608  634  984   45  236  261  466  521  702  740  741  748  311  494  555
## [346]  745  931  937  948   99  101  626  847   43  192  325  420  496 1015   24
## [361]  180  197  465  539  549  594  597  629  916  943  698 1014  370  242  215
## [376]  342  776  228   30   86  118  161  344  423  433  468  534  572  587  708
## [391]  790  826  383  122  399  417  520  552  919  929  958  974 1002   15   81
## [406]  221  364  554  559  766  548  411  859  213  222  315  385  415  425  429
## [421]  456  607  662  709  819  870  129  147  264  384  421  484  676  874  940
## [436]  903  836  280  699
# How many codons in spontaneous mutants excluding those found in mutagen treated samples?
# This calculation only includes missense!
# One codon value per missense row, for controls and non-controls respectively
control_aa <- dplyr::pull(controls, Codon)
non_control_aa <- dplyr::pull(non_controls, Codon)
# Distinct codons seen in controls but never in non-controls
unique(control_aa[!(control_aa %in% non_control_aa)])
##  [1]   55   77  154  323  379  440  535  673  832  914  950  651  770  921  775
## [16]  181  112  166  782  863  105  190  360  169  800   19  268  362  872  245
## [31]  624  773  796  448  294  352  366  395  405  474  492  515  733  797  407
## [46]  925  946  116  900  164  281 1022   25
# Another way to calculate it: keep the control rows whose codon never occurs in non-controls
dplyr::filter(controls, !(Codon %in% dplyr::pull(non_controls, Codon)))
## # A tibble: 63 × 41
## # Groups:   Codon [53]
##    Exposure Tissue       Dose Position Ref   Alt   `Tech Rep1` `Tech Rep2`
##    <chr>    <chr>       <dbl>    <dbl> <chr> <chr>       <dbl>       <dbl>
##  1 Control  Bone Marrow     0      182 A     C          0.0327      0.0239
##  2 Control  Bone Marrow     0      247 G     T          0.0376      0.036 
##  3 Control  Bone Marrow     0      478 T     C          0.0106      0.011 
##  4 Control  Bone Marrow     0      478 T     C          0.0396      0.0453
##  5 Control  Bone Marrow     0      986 T     A          0.136       0.128 
##  6 Control  Bone Marrow     0     1154 T     G          0.0043      0.0042
##  7 Control  Bone Marrow     0     1336 G     T          0.0249      0.0253
##  8 Control  Bone Marrow     0     1622 T     G          0.0085      0.0089
##  9 Control  Bone Marrow     0     1622 T     G          0.0337      0.0319
## 10 Control  Bone Marrow     0     2035 G     A          0.0503      0.0681
## # ℹ 53 more rows
## # ℹ 33 more variables: `Tech Difference` <dbl>, Background <dbl>,
## #   `Avg Freq` <dbl>, Count <dbl>, `A:T to G:C` <dbl>, `G:C to A:T` <dbl>,
## #   `G:C to T:A` <dbl>, `G:C to C:G` <dbl>, `A:T to T:A` <dbl>,
## #   `A:T to C:G` <dbl>, Insertion <dbl>, Deletion <dbl>, Codon <dbl>,
## #   Consequence <chr>, `Ref Codon` <chr>, `Alt Codon` <chr>, `Ref A.A.` <chr>,
## #   `Alt A.A.` <chr>, Type <chr>, Study <chr>, Technology <chr>, …
# There are 63 of these mutations (rows), spread over 53 codons:
controls_only_missense <- dplyr::filter(
  controls,
  !(Codon %in% dplyr::pull(non_controls, Codon))
)

Numbers used, hotspots, by nucleotide

# Hotspot analysis
# Deliberately not limited to missense (the Consequence filter is disabled):
# nonsense mutations, etc., are included, because all consequences are
# important for counting where hotspots are.
snvs_only <- dplyr::filter(snvs_ins_del_clean, Type == "SNV")
dplyr::tally(snvs_only) # Same as above, 5,147
## # A tibble: 1 × 1
##       n
##   <int>
## 1  5147
# Number of SNVs observed at each mutated nucleotide position
counts_per_nucleotide <- snvs_only %>%
  dplyr::count(Position) %>%
  dplyr::pull(n)
mean(counts_per_nucleotide)
## [1] 4.892586
sd(counts_per_nucleotide)
## [1] 8.932928
mean(counts_per_nucleotide) + sd(counts_per_nucleotide)
## [1] 13.82551
# Hotspot threshold: one standard deviation above the mean per-position count
hotspot_cutoff_value <- mean(counts_per_nucleotide) + sd(counts_per_nucleotide)

# Hotspot SNVs: every SNV row at a position whose total SNV count (column n)
# reaches the cutoff, i.e. the mean number of mutations per nucleotide
# position plus one SD (cutoff changed from 10 to 14).
hotspots <- snvs_only %>%
  dplyr::add_count(Position) %>%
  dplyr::filter(n >= hotspot_cutoff_value)
# Number of mutations at hotspot positions. The earlier figure of 1,256
# referred to missense hotspots only (i.e., SNV and missense mutations).
dplyr::tally(hotspots)
## # A tibble: 1 × 1
##       n
##   <int>
## 1  2069
# There are 2,069 when including all consequences
# Hotspot positions with their SNV counts, most frequently mutated first
hotspots %>%
  dplyr::count(Position) %>%
  dplyr::arrange(dplyr::desc(n))
## # A tibble: 74 × 2
##    Position     n
##       <dbl> <int>
##  1     1187   112
##  2     2374   106
##  3     1627    91
##  4     1072    90
##  5     1090    81
##  6     2713    56
##  7     1831    48
##  8     2743    46
##  9     2744    45
## 10      928    44
## # ℹ 64 more rows
# There are 74 hotspots when including all consequences
# There are 39 missense hotspot nucleotides with a count >13 independent mutations
# 136 if using a cutoff of 10 as done originally without Sanger data

# Hotspot positions ranked by SNV count (most frequent first). Position is
# turned into a factor whose levels follow that ranking, so the bar chart
# below is drawn in rank order rather than in numeric position order.
ranked_hotspots_all <- hotspots %>%
  dplyr::count(Position) %>%
  dplyr::arrange(dplyr::desc(n)) %>%
  dplyr::mutate(Position = factor(Position, levels = Position))

# ranked_hotspots_ngs$Position <- factor(ranked_hotspots_ngs$Position,
#                                    levels = ranked_hotspots_ngs$Position)
#
# ggplot(ranked_hotspots_ngs %>% head(n = 20), aes(x = Position, y = n, group = Position)) +
#   geom_bar(stat = "identity") +
#   theme(axis.text.x = element_text(angle = 90))

# Top 20 hotspot positions
ggplot(
  head(ranked_hotspots_all, n = 20),
  aes(x = Position, y = n, group = Position)
) +
  geom_col() +
  theme(axis.text.x = element_text(angle = 90))

Hotspots - how many are at CpGs?

# Unique nucleotide positions flagged as hotspots
hs <- unique(hotspots$Position)

# One width-1 range per hotspot position on the "lacZ" sequence. The ranges
# are built directly from `hs`; building an IRanges only to read its
# start()/end() back gives the same coordinates.
hotspot_ranges <- GRanges(
  seqnames = "lacZ",
  ranges = IRanges(start = hs, end = hs)
)

# Hotspot positions that overlap a range in CpG_sites_ranges, as a data frame
CpGs_in_hotspots <- plyranges::find_overlaps(hotspot_ranges, CpG_sites_ranges)
CpGs_in_hotspots <- as.data.frame(CpGs_in_hotspots) # %>% dplyr::filter(Type == "SNV")
CpGs_in_hotspots
##    seqnames start  end width strand
## 1      lacZ   461  461     1      *
## 2      lacZ   637  637     1      *
## 3      lacZ   928  928     1      *
## 4      lacZ  1016 1016     1      *
## 5      lacZ  1090 1090     1      *
## 6      lacZ  1187 1187     1      *
## 7      lacZ  1196 1196     1      *
## 8      lacZ  1334 1334     1      *
## 9      lacZ  1520 1520     1      *
## 10     lacZ  1627 1627     1      *
## 11     lacZ  1638 1638     1      *
## 12     lacZ  1831 1831     1      *
## 13     lacZ  2374 2374     1      *
## 14     lacZ  2375 2375     1      *
## 15     lacZ  2392 2392     1      *
## 16     lacZ  2659 2659     1      *
## 17     lacZ  2713 2713     1      *
## 18     lacZ  2740 2740     1      *
## 19     lacZ  2743 2743     1      *
## 20     lacZ  2744 2744     1      *
## 21     lacZ  2813 2813     1      *
## 22     lacZ  2817 2817     1      *
## 23     lacZ  2835 2835     1      *
## 24     lacZ  2840 2840     1      *
## 25     lacZ   136  136     1      *
## 26     lacZ   187  187     1      *
## 27     lacZ   303  303     1      *
## 28     lacZ   436  436     1      *
## 29     lacZ   501  501     1      *
## 30     lacZ   759  759     1      *
## 31     lacZ  1018 1018     1      *
## 32     lacZ  1224 1224     1      *
## 33     lacZ  1233 1233     1      *
## 34     lacZ  1342 1342     1      *
## 35     lacZ  1388 1388     1      *
## 36     lacZ  1527 1527     1      *
## 37     lacZ  1626 1626     1      *
## 38     lacZ  1739 1739     1      *
## 39     lacZ  1775 1775     1      *
## 40     lacZ  1782 1782     1      *
## 41     lacZ  2266 2266     1      *
## 42     lacZ  2473 2473     1      *
## 43     lacZ  2805 2805     1      *
## 44     lacZ  3029 3029     1      *
## 45     lacZ  1072 1072     1      *
# All mutation rows (no Type filter is applied) whose Position is one of the
# hotspot positions overlapping a CpG site
hotspot_cpg_muts <- dplyr::filter(
  snvs_ins_del_clean,
  Position %in% CpGs_in_hotspots$start
)
hotspot_cpg_muts
## # A tibble: 1,566 × 41
##    Exposure Tissue       Dose Position Ref   Alt   `Tech Rep1` `Tech Rep2`
##    <chr>    <chr>       <dbl>    <dbl> <chr> <chr>       <dbl>       <dbl>
##  1 BaP      Bone Marrow   100      461 C     G          0.0043      0.0047
##  2 BaP      Bone Marrow   100      461 C     G          0.007       0.0084
##  3 BaP      Bone Marrow   100      461 C     G          0.0029      0.0023
##  4 BaP      Bone Marrow   100      637 G     T          0.0021      0.0021
##  5 BaP      Bone Marrow   100      637 G     T          0.0029      0.0026
##  6 BaP      Bone Marrow   100      637 G     T          0.0075      0.0083
##  7 BaP      Bone Marrow   100      637 G     C          0.0026      0.0022
##  8 BaP      Bone Marrow   100      637 G     C          0.0029      0.0025
##  9 BaP      Bone Marrow   100      637 G     C          0.0026      0.0021
## 10 BaP      Bone Marrow   100      637 G     C          0.0023      0.0031
## # ℹ 1,556 more rows
## # ℹ 33 more variables: `Tech Difference` <dbl>, Background <dbl>,
## #   `Avg Freq` <dbl>, Count <dbl>, `A:T to G:C` <dbl>, `G:C to A:T` <dbl>,
## #   `G:C to T:A` <dbl>, `G:C to C:G` <dbl>, `A:T to T:A` <dbl>,
## #   `A:T to C:G` <dbl>, Insertion <dbl>, Deletion <dbl>, Codon <dbl>,
## #   Consequence <chr>, `Ref Codon` <chr>, `Alt Codon` <chr>, `Ref A.A.` <chr>,
## #   `Alt A.A.` <chr>, Type <chr>, Study <chr>, Technology <chr>, …
# There are 45 hotspots that overlap with CpG sites

# Fraction of hotspot positions that overlap a CpG site
nrow(CpGs_in_hotspots) / length(hs)
## [1] 0.6081081

Numbers - NGS technology hotspots only (not included in study)

# Hotspots in NGS data
snvs_only_ngs <- snvs_ins_del_clean %>%
  dplyr::filter(Type == "SNV") %>%
  dplyr::filter(Technology == "NGS")

# SNV rows at positions whose NGS SNV count reaches the cutoff.
# NOTE: the threshold is `hotspot_cutoff_value`, which was derived from SNVs
# of all technologies; the NGS-only mean + SD printed further down is not
# used for this filter.
# The result is ungrouped, matching how `hotspots` is built, so that no
# grouping leaks into later steps.
hotspots_ngs <- snvs_only_ngs %>%
  dplyr::group_by(Position) %>%
  dplyr::add_count() %>%
  dplyr::ungroup() %>%
  dplyr::filter(n >= hotspot_cutoff_value)

# NGS hotspot positions with their SNV counts
hotspots_ngs %>%
  dplyr::group_by(Position) %>%
  tally()
## # A tibble: 52 × 2
##    Position     n
##       <dbl> <int>
##  1      303    20
##  2      421    17
##  3      461    26
##  4      501    39
##  5      610    14
##  6      637    18
##  7      638    18
##  8      928    29
##  9     1016    15
## 10     1018    22
## # ℹ 42 more rows
# Number of NGS hotspot positions per reference codon sequence: reduce to
# the distinct Position / `Ref Codon` pairs, then count pairs per codon.
hotspots_ngs %>%
  dplyr::ungroup() %>%
  dplyr::distinct(Position, `Ref Codon`) %>%
  dplyr::count(`Ref Codon`)
## # A tibble: 18 × 2
##    `Ref Codon`     n
##    <chr>       <int>
##  1 CAC             2
##  2 CAG             1
##  3 CCG             2
##  4 CGA             2
##  5 CGC             4
##  6 CGG             1
##  7 CGT             2
##  8 GAA             5
##  9 GAC             2
## 10 GAG             4
## 11 GGA             2
## 12 GGC             6
## 13 GGG             1
## 14 GGT             1
## 15 TAC             8
## 16 TCG             5
## 17 TGC             2
## 18 TGG             2
# NGS hotspot positions ranked by SNV count, most frequent first
ranked_hotspots_ngs <- hotspots_ngs %>%
  dplyr::group_by(Position) %>%
  dplyr::tally() %>%
  dplyr::arrange(dplyr::desc(n))

# Per-position SNV counts for NGS data only.
# Note: this overwrites the all-technology `counts_per_nucleotide` vector.
counts_per_nucleotide <- snvs_only_ngs %>%
  dplyr::count(Position) %>%
  dplyr::pull(n)
mean(counts_per_nucleotide)
## [1] 4.235028
sd(counts_per_nucleotide)
## [1] 5.856226
# Mean + SD for NGS data alone (printed only, not stored)
mean(counts_per_nucleotide) + sd(counts_per_nucleotide)
## [1] 10.09125

Numbers used for hotspot analysis, aggregated by codon

# 692 codons mutated in total (one row per codon, n = mutations at it)
dplyr::count(snvs_ins_del_clean, Codon)
## # A tibble: 692 × 2
##    Codon     n
##    <dbl> <int>
##  1     2     2
##  2     7    10
##  3     8    16
##  4    11     5
##  5    12     9
##  6    15     1
##  7    16    19
##  8    17     2
##  9    19     3
## 10    23    10
## # ℹ 682 more rows
# 492 codons with missense mutations
snvs_ins_del_clean %>%
  dplyr::filter(Consequence == "missense") %>%
  dplyr::count(Codon)
## # A tibble: 492 × 2
##    Codon     n
##    <dbl> <int>
##  1     2     1
##  2     7     3
##  3     8     6
##  4    11     1
##  5    12     2
##  6    15     1
##  7    16     1
##  8    19     1
##  9    24     1
## 10    25     1
## # ℹ 482 more rows
# 605 codons mutated in NGS data (snvs_only_ngs holds SNVs only)
dplyr::count(snvs_only_ngs, Codon)
## # A tibble: 605 × 2
##    Codon     n
##    <dbl> <int>
##  1     2     2
##  2     7     8
##  3     8    14
##  4    11     3
##  5    12     6
##  6    16    13
##  7    17     2
##  8    19     2
##  9    23     5
## 10    24     1
## # ℹ 595 more rows
# 384 codons with missense mutations in NGS data
snvs_only_ngs %>%
  dplyr::filter(Consequence == "missense") %>%
  dplyr::count(Codon)
## # A tibble: 384 × 2
##    Codon     n
##    <dbl> <int>
##  1     2     1
##  2     7     3
##  3     8     6
##  4    11     1
##  5    12     2
##  6    16     1
##  7    24     1
##  8    31     2
##  9    32     1
## 10    34     1
## # ℹ 374 more rows
# 266 codons identified by Sanger
# (`sanger` already holds missense rows only; the filter is kept as a guard)
sanger %>%
  dplyr::filter(Consequence == "missense") %>%
  dplyr::group_by(Codon) %>%
  dplyr::tally()
## # A tibble: 266 × 2
##    Codon     n
##    <dbl> <int>
##  1    15     1
##  2    19     1
##  3    25     1
##  4    30     1
##  5    47     1
##  6    54     1
##  7    65     1
##  8    67     1
##  9    73     2
## 10    81     1
## # ℹ 256 more rows
# How many codons found with both technologies?
# Each side is reduced to its distinct codons before joining, so the join is
# one-to-one instead of many-to-many, and the key is given explicitly rather
# than guessed from shared column names. The result is unchanged: one row
# per codon present in both `sanger` and `ngs`.
dplyr::inner_join(
  sanger %>% dplyr::distinct(Codon),
  ngs %>% dplyr::distinct(Codon),
  by = "Codon"
) # 158
## # A tibble: 158 × 1
## # Groups:   Codon [158]
##    Codon
##    <dbl>
##  1   148
##  2   391
##  3   540
##  4   880
##  5   881
##  6   941
##  7   994
##  8   355
##  9   786
## 10   908
## # ℹ 148 more rows
# Calculate missense mutations per codon (all data)
counts_per_codon <- snvs_ins_del_clean %>%
  dplyr::filter(Consequence == "missense") %>%
  dplyr::count(Codon) %>%
  dplyr::pull(n)
mean(counts_per_codon)
## [1] 5.552846
sd(counts_per_codon)
## [1] 13.07067
mean(counts_per_codon) + sd(counts_per_codon)
## [1] 18.62352
# Codon hotspot threshold: one standard deviation above the mean per-codon count
hotspot_cutoff_value_codon <- mean(counts_per_codon) + sd(counts_per_codon)

# Codon hotspots

# NGS only - not used in study
# Missense NGS SNVs at codons whose NGS missense count (column n) reaches
# hotspot_cutoff_value_codon, the cutoff computed from all missense data.
hotspots_ngs_codons <- snvs_only_ngs %>%
  dplyr::filter(Consequence == "missense") %>%
  dplyr::add_count(Codon) %>%
  dplyr::filter(n >= hotspot_cutoff_value_codon)

# NGS codon hotspots with their missense counts
dplyr::count(hotspots_ngs_codons, Codon)
## # A tibble: 24 × 2
##    Codon     n
##    <dbl> <int>
##  1   201    28
##  2   207    36
##  3   301    19
##  4   304    19
##  5   353    20
##  6   358    21
##  7   390    31
##  8   393    19
##  9   406    26
## 10   439    22
## # ℹ 14 more rows
# NGS codon hotspots ranked by missense count (most frequent first). Codon is
# turned into a factor whose levels follow that ranking, so the bar chart
# below is drawn in rank order.
ranked_hotspots_codons <- hotspots_ngs_codons %>%
  dplyr::count(Codon) %>%
  dplyr::arrange(dplyr::desc(n)) %>%
  dplyr::mutate(Codon = factor(Codon, levels = Codon))

# Up to the 37 top-ranked codons (as a factor)
codons_of_interest <- dplyr::pull(head(ranked_hotspots_codons, n = 37), Codon)

ggplot(
  head(ranked_hotspots_codons, n = 37),
  aes(x = Codon, y = n, group = Codon)
) +
  geom_col() +
  theme(axis.text.x = element_text(angle = 90))

# Codon hotspots - all
# Missense rows at codons whose missense count (column n) reaches the codon
# cutoff. The Type == "SNV" restriction is intentionally left disabled.
hotspots_codons <- snvs_ins_del_clean %>%
  dplyr::filter(Consequence == "missense") %>%
  dplyr::add_count(Codon) %>%
  dplyr::filter(n >= hotspot_cutoff_value_codon)

# Missense rows with a functional change at codons that have at least two
# such rows. The Type == "SNV" restriction is intentionally left disabled.
# The result stays grouped by Codon.
hotspots_codons_2_independent <- snvs_ins_del_clean %>%
  dplyr::filter(Consequence == "missense", FunctionalChange == 1) %>%
  dplyr::group_by(Codon) %>%
  dplyr::add_count() %>%
  dplyr::filter(n >= 2)

# In all the data, how many codons are affected by two or more of these
# missense mutations (one row per codon, n = mutations at it)?
hotspots_codons_2_independent %>%
  dplyr::group_by(Codon) %>%
  dplyr::tally()
## # A tibble: 264 × 2
##    Codon     n
##    <dbl> <int>
##  1     7     3
##  2     8     6
##  3    12     2
##  4    31     2
##  5    38     3
##  6    44     2
##  7    45     2
##  8    46     2
##  9    47     2
## 10    52     3
## # ℹ 254 more rows
# Distinct codons with two or more missense mutations, in order of first appearance
codons_to_visualize <- unique(dplyr::pull(hotspots_codons_2_independent, Codon))
codons_to_visualize
##   [1]    7    8   31   38   44   46   52   54   62   65   67   87  119  120  121
##  [16]  139  145  148  149  155  160  162  168  184  185  186  193  194  199  200
##  [31]  201  203  204  206  207  210  216  251  254  255  269  270  272  295  301
##  [46]  302  304  328  331  333  336  345  346  353  355  356  357  358  375  380
##  [61]  381  386  387  388  389  390  391  393  402  403  404  406  412  418  419
##  [76]  436  439  446  451  452  459  461  463  464  482  489  490  498  501  502
##  [91]  504  505  507  509  513  528  532  537  540  541  543  544  545  547  564
## [106]  565  568  569  570  573  589  590  599  605  610  611  622  640  670  705
## [121]  716  722  777  780  781  785  786  787  788  791  792  805  812  841  852
## [136]  880  881  899  901  905  907  908  909  912  930  932  933  934  935  936
## [151]  938  941  942  947  952  953  954  987  994  995  999   12   91   94  146
## [166]  211  324  449  546  691  809  810  883  890  897  967  997  998  124  154
## [181]  323  354  414  457  460  462  488  500  503  535  542  602  652  806  898
## [196]  950  951  993   88  100  198  212  256  259  359  416  487  533  567  603
## [211]  604  606  817  840  884  895  904   47  326  422   73  102  165  279  299
## [226]  398  561  763   45  741  748  134  135  770  712  745  937  430  524  847
## [241]  192  497  313  592  180  197  549  597  557  698  485  215  112  166  782
## [256]  453  296  986   86  572  552 1002  411  429
length(codons_to_visualize)
## [1] 264
# There are 263... or 264 if you look at missense not limited to SNVs
# "+"-joined codon list, for PyMol
paste0(codons_to_visualize, collapse = "+")
## [1] "7+8+31+38+44+46+52+54+62+65+67+87+119+120+121+139+145+148+149+155+160+162+168+184+185+186+193+194+199+200+201+203+204+206+207+210+216+251+254+255+269+270+272+295+301+302+304+328+331+333+336+345+346+353+355+356+357+358+375+380+381+386+387+388+389+390+391+393+402+403+404+406+412+418+419+436+439+446+451+452+459+461+463+464+482+489+490+498+501+502+504+505+507+509+513+528+532+537+540+541+543+544+545+547+564+565+568+569+570+573+589+590+599+605+610+611+622+640+670+705+716+722+777+780+781+785+786+787+788+791+792+805+812+841+852+880+881+899+901+905+907+908+909+912+930+932+933+934+935+936+938+941+942+947+952+953+954+987+994+995+999+12+91+94+146+211+324+449+546+691+809+810+883+890+897+967+997+998+124+154+323+354+414+457+460+462+488+500+503+535+542+602+652+806+898+950+951+993+88+100+198+212+256+259+359+416+487+533+567+603+604+606+817+840+884+895+904+47+326+422+73+102+165+279+299+398+561+763+45+741+748+134+135+770+712+745+937+430+524+847+192+497+313+592+180+197+549+597+557+698+485+215+112+166+782+453+296+986+86+572+552+1002+411+429"
# Codon hotspots (all data) with their missense counts
dplyr::count(hotspots_codons, Codon)
## # A tibble: 33 × 2
##    Codon     n
##    <dbl> <int>
##  1   201    36
##  2   203    37
##  3   207    56
##  4   301    21
##  5   304    32
##  6   353    23
##  7   358    72
##  8   390   100
##  9   391    23
## 10   393    39
## # ℹ 23 more rows
# Codon hotspots ranked by missense count (most frequent first). Codon is
# turned into a factor whose levels follow that ranking.
ranked_hotspots_codons_all <- hotspots_codons %>%
  dplyr::count(Codon) %>%
  dplyr::arrange(dplyr::desc(n)) %>%
  dplyr::mutate(Codon = factor(Codon, levels = Codon))

# 33 codons of interest, top ranked hot spots
codons_of_interest <- dplyr::pull(
  head(ranked_hotspots_codons_all, n = 33),
  Codon
)

# Codons of interest that also occur in existing$aa_pos
codons_of_interest[codons_of_interest %in% existing$aa_pos]
## [1] 537 540 568 201 503 391
## 33 Levels: 786 390 909 537 358 540 899 605 207 568 881 792 393 203 201 ... 901
# Rows of `existing` whose aa_pos is one of the codons of interest
existing[existing$aa_pos %in% codons_of_interest, ]
##    nuc_start nuc_end wt_aa aa_pos ref_nuc_range ref_nuc_start ref_nuc_end
## 2        619     621   Asp    201     601 - 603           601         603
## 4       1189    1191   His    391   1171 - 1173          1171        1173
## 9       1525    1527   Tyr    503   1507 - 1509          1507        1509
## 11      1627    1629   Glu    537   1609 - 1611          1609        1611
## 12      1636    1638   His    540   1618 - 1620          1618        1620
## 14      1720    1722   Trp    568   1702 - 1704          1702        1704
paste(codons_of_interest, collapse = "+") # For PyMol
## [1] "786+390+909+537+358+540+899+605+207+568+881+792+393+203+201+439+304+547+406+564+503+459+565+353+391+501+908+941+301+502+791+545+901"
# Bar chart of the 33 top-ranked codon hotspots, in rank order
ggplot(
  head(ranked_hotspots_codons_all, n = 33),
  aes(x = Codon, y = n, group = Codon)
) +
  geom_col() +
  theme(axis.text.x = element_text(angle = 90))

# Missense mutation counts at codon hotspots per residue and domain, sorted
# by domain and then by descending count
hotspots_codons_summary <- hotspots_codons %>%
  dplyr::group_by(residue_name, Domain) %>%
  dplyr::tally() %>%
  dplyr::arrange(Domain, dplyr::desc(n))
hotspots_codons_summary
## # A tibble: 34 × 3
## # Groups:   residue_name [34]
##    residue_name Domain                      n
##    <chr>        <chr>                   <int>
##  1 Gly207       Sugar Binding (PF02837)    56
##  2 Trp203       Sugar Binding (PF02837)    37
##  3 Asp201       Sugar Binding (PF02837)    36
##  4 Ser390       TIM Barrel (PF02836)      100
##  5 Glu537       TIM Barrel (PF02836)       89
##  6 Glu358       TIM Barrel (PF02836)       72
##  7 His540       TIM Barrel (PF02836)       67
##  8 Gly605       TIM Barrel (PF02836)       59
##  9 Trp568       TIM Barrel (PF02836)       54
## 10 Pro393       TIM Barrel (PF02836)       39
## # ℹ 24 more rows
# Table 2: hotspot codon summary, tab-separated, no quoting or row names.
# TRUE/FALSE are spelled out because T/F can be reassigned.
write.table(hotspots_codons_summary,
  file = "data/processed/table2.txt",
  quote = FALSE, sep = "\t", row.names = FALSE
)

# Table S13: missense mutation count for every residue/domain combination
# (not only hotspots), sorted by domain and then by descending count.
# The former per-codon add_count()/ungroup() step was dropped: the data are
# regrouped by residue_name/Domain straight afterwards and tally() counts
# rows, so the extra column never influenced the table. The Type == "SNV"
# restriction remains disabled.
table_S13 <- snvs_ins_del_clean %>%
  dplyr::filter(Consequence == "missense") %>%
  dplyr::group_by(residue_name, Domain) %>%
  tally() %>%
  arrange(Domain, -n)
write.table(table_S13,
  file = "data/processed/table_s13.txt",
  quote = FALSE, sep = "\t", row.names = FALSE
)

Look at specific amino acid residues and their abundance in the data

# Glutamate 537 (Glu537) example: all missense mutations at codon 537
snvs_ins_del_clean %>%
  dplyr::filter(Codon == 537, Consequence == "missense")
## # A tibble: 89 × 41
##    Exposure Tissue       Dose Position Ref   Alt   `Tech Rep1` `Tech Rep2`
##    <chr>    <chr>       <dbl>    <dbl> <chr> <chr>       <dbl>       <dbl>
##  1 BaP      Bone Marrow   100     1627 G     A          0.0101      0.01  
##  2 BaP      Bone Marrow   100     1627 G     C          0.0026      0.0025
##  3 BaP      Bone Marrow   100     1627 G     C          0.0029      0.0026
##  4 BaP      Bone Marrow   100     1627 G     C          0.0031      0.0034
##  5 BaP      Bone Marrow   100     1627 G     A          0.0047      0.004 
##  6 Control  Bone Marrow     0     1627 G     A          0.746       0.746 
##  7 Control  Bone Marrow     0     1627 G     A          0.272       0.276 
##  8 Control  Bone Marrow     0     1627 G     A          0.0298      0.0305
##  9 Control  Bone Marrow     0     1627 G     A          0.171       0.178 
## 10 Control  Bone Marrow     0     1627 G     A          0.210       0.201 
## # ℹ 79 more rows
## # ℹ 33 more variables: `Tech Difference` <dbl>, Background <dbl>,
## #   `Avg Freq` <dbl>, Count <dbl>, `A:T to G:C` <dbl>, `G:C to A:T` <dbl>,
## #   `G:C to T:A` <dbl>, `G:C to C:G` <dbl>, `A:T to T:A` <dbl>,
## #   `A:T to C:G` <dbl>, Insertion <dbl>, Deletion <dbl>, Codon <dbl>,
## #   Consequence <chr>, `Ref Codon` <chr>, `Alt Codon` <chr>, `Ref A.A.` <chr>,
## #   `Alt A.A.` <chr>, Type <chr>, Study <chr>, Technology <chr>, …
# Codon 537 missense mutations, restricted to rows labelled "This Study"
snvs_ins_del_clean %>%
  dplyr::filter(
    Codon == 537,
    Consequence == "missense",
    Study == "This Study"
  )
## # A tibble: 37 × 41
##    Exposure Tissue       Dose Position Ref   Alt   `Tech Rep1` `Tech Rep2`
##    <chr>    <chr>       <dbl>    <dbl> <chr> <chr>       <dbl>       <dbl>
##  1 BaP      Bone Marrow   100     1627 G     A          0.0101      0.01  
##  2 BaP      Bone Marrow   100     1627 G     C          0.0026      0.0025
##  3 BaP      Bone Marrow   100     1627 G     C          0.0029      0.0026
##  4 BaP      Bone Marrow   100     1627 G     C          0.0031      0.0034
##  5 BaP      Bone Marrow   100     1627 G     A          0.0047      0.004 
##  6 Control  Bone Marrow     0     1627 G     A          0.746       0.746 
##  7 Control  Bone Marrow     0     1627 G     A          0.272       0.276 
##  8 Control  Bone Marrow     0     1627 G     A          0.0298      0.0305
##  9 Control  Bone Marrow     0     1627 G     A          0.171       0.178 
## 10 Control  Bone Marrow     0     1627 G     A          0.210       0.201 
## # ℹ 27 more rows
## # ℹ 33 more variables: `Tech Difference` <dbl>, Background <dbl>,
## #   `Avg Freq` <dbl>, Count <dbl>, `A:T to G:C` <dbl>, `G:C to A:T` <dbl>,
## #   `G:C to T:A` <dbl>, `G:C to C:G` <dbl>, `A:T to T:A` <dbl>,
## #   `A:T to C:G` <dbl>, Insertion <dbl>, Deletion <dbl>, Codon <dbl>,
## #   Consequence <chr>, `Ref Codon` <chr>, `Alt Codon` <chr>, `Ref A.A.` <chr>,
## #   `Alt A.A.` <chr>, Type <chr>, Study <chr>, Technology <chr>, …
# Arg786: missense mutations at codon 786 detected by NGS
snvs_ins_del_clean %>%
  dplyr::filter(
    Codon == 786,
    Consequence == "missense",
    Technology == "NGS"
  )
## # A tibble: 96 × 41
##    Exposure Tissue       Dose Position Ref   Alt   `Tech Rep1` `Tech Rep2`
##    <chr>    <chr>       <dbl>    <dbl> <chr> <chr>       <dbl>       <dbl>
##  1 BaP      Bone Marrow   100     2374 C     A          0.0309      0.0316
##  2 BaP      Bone Marrow   100     2374 C     G          0.0144      0.0139
##  3 BaP      Bone Marrow   100     2374 C     A          0.0497      0.0517
##  4 BaP      Bone Marrow   100     2374 C     G          0.0029      0.0028
##  5 BaP      Bone Marrow   100     2374 C     A          0.0093      0.0098
##  6 BaP      Bone Marrow   100     2374 C     T          0.0076      0.0082
##  7 BaP      Bone Marrow   100     2374 C     A          0.0344      0.0309
##  8 BaP      Bone Marrow   100     2374 C     T          0.0035      0.004 
##  9 BaP      Bone Marrow   100     2374 C     G          0.0027      0.0023
## 10 BaP      Bone Marrow   100     2374 C     G          0.0087      0.0072
## # ℹ 86 more rows
## # ℹ 33 more variables: `Tech Difference` <dbl>, Background <dbl>,
## #   `Avg Freq` <dbl>, Count <dbl>, `A:T to G:C` <dbl>, `G:C to A:T` <dbl>,
## #   `G:C to T:A` <dbl>, `G:C to C:G` <dbl>, `A:T to T:A` <dbl>,
## #   `A:T to C:G` <dbl>, Insertion <dbl>, Deletion <dbl>, Codon <dbl>,
## #   Consequence <chr>, `Ref Codon` <chr>, `Alt Codon` <chr>, `Ref A.A.` <chr>,
## #   `Alt A.A.` <chr>, Type <chr>, Study <chr>, Technology <chr>, …
# Arg786 carries 143 mutations in total:
# 142 missense and 1 indel.
# Grouping by (Ref, Alt) makes the printed header report how many distinct
# substitutions occur at this codon.
snvs_ins_del_clean %>%
  dplyr::filter(Codon == 786) %>%
  dplyr::group_by(Ref, Alt)
## # A tibble: 143 × 41
## # Groups:   Ref, Alt [7]
##    Exposure Tissue       Dose Position Ref   Alt   `Tech Rep1` `Tech Rep2`
##    <chr>    <chr>       <dbl>    <dbl> <chr> <chr>       <dbl>       <dbl>
##  1 BaP      Bone Marrow   100     2374 C     A          0.0309      0.0316
##  2 BaP      Bone Marrow   100     2374 C     G          0.0144      0.0139
##  3 BaP      Bone Marrow   100     2374 C     A          0.0497      0.0517
##  4 BaP      Bone Marrow   100     2374 C     G          0.0029      0.0028
##  5 BaP      Bone Marrow   100     2374 C     A          0.0093      0.0098
##  6 BaP      Bone Marrow   100     2374 C     T          0.0076      0.0082
##  7 BaP      Bone Marrow   100     2374 C     A          0.0344      0.0309
##  8 BaP      Bone Marrow   100     2374 C     T          0.0035      0.004 
##  9 BaP      Bone Marrow   100     2374 C     G          0.0027      0.0023
## 10 BaP      Bone Marrow   100     2374 C     G          0.0087      0.0072
## # ℹ 133 more rows
## # ℹ 33 more variables: `Tech Difference` <dbl>, Background <dbl>,
## #   `Avg Freq` <dbl>, Count <dbl>, `A:T to G:C` <dbl>, `G:C to A:T` <dbl>,
## #   `G:C to T:A` <dbl>, `G:C to C:G` <dbl>, `A:T to T:A` <dbl>,
## #   `A:T to C:G` <dbl>, Insertion <dbl>, Deletion <dbl>, Codon <dbl>,
## #   Consequence <chr>, `Ref Codon` <chr>, `Alt Codon` <chr>, `Ref A.A.` <chr>,
## #   `Alt A.A.` <chr>, Type <chr>, Study <chr>, Technology <chr>, …
# Total number of mutations at codon 786: count rows per (Ref, Alt) pair and
# add the per-pair counts together.
snvs_ins_del_clean %>%
  dplyr::filter(Codon == 786) %>%
  dplyr::count(Ref, Alt) %>%
  dplyr::pull(n) %>%
  sum()
## [1] 143
# Ser390 carries 113 mutations in total.
# Grouping by (Ref, Alt) makes the printed header report how many distinct
# substitutions occur at this codon.
snvs_ins_del_clean %>%
  dplyr::filter(Codon == 390) %>%
  dplyr::group_by(Ref, Alt)
## # A tibble: 113 × 41
## # Groups:   Ref, Alt [4]
##    Exposure Tissue       Dose Position Ref   Alt   `Tech Rep1` `Tech Rep2`
##    <chr>    <chr>       <dbl>    <dbl> <chr> <chr>       <dbl>       <dbl>
##  1 BaP      Bone Marrow   100     1187 C     T          0.0038      0.0043
##  2 BaP      Bone Marrow   100     1187 C     G          0.0044      0.0037
##  3 BaP      Bone Marrow   100     1187 C     A          0.0028      0.0028
##  4 BaP      Bone Marrow   100     1187 C     A          0.0044      0.0043
##  5 BaP      Bone Marrow   100     1187 C     A          0.009       0.0087
##  6 BaP      Bone Marrow   100     1187 C     A          0.0117      0.0122
##  7 BaP      Bone Marrow   100     1187 C     A          0.0053      0.005 
##  8 BaP      Bone Marrow   100     1188 G     C          0.0025      0.0026
##  9 BaP-IU   Bone Marrow    20     1187 C     T          0.230       0.175 
## 10 BaP-IU   Bone Marrow    40     1187 C     T          0.0183      0.0154
## # ℹ 103 more rows
## # ℹ 33 more variables: `Tech Difference` <dbl>, Background <dbl>,
## #   `Avg Freq` <dbl>, Count <dbl>, `A:T to G:C` <dbl>, `G:C to A:T` <dbl>,
## #   `G:C to T:A` <dbl>, `G:C to C:G` <dbl>, `A:T to T:A` <dbl>,
## #   `A:T to C:G` <dbl>, Insertion <dbl>, Deletion <dbl>, Codon <dbl>,
## #   Consequence <chr>, `Ref Codon` <chr>, `Alt Codon` <chr>, `Ref A.A.` <chr>,
## #   `Alt A.A.` <chr>, Type <chr>, Study <chr>, Technology <chr>, …
# Total number of mutations at codon 390: count rows per (Ref, Alt) pair and
# add the per-pair counts together.
snvs_ins_del_clean %>%
  dplyr::filter(Codon == 390) %>%
  dplyr::count(Ref, Alt) %>%
  dplyr::pull(n) %>%
  sum()
## [1] 113
# SNVs at reference position 1,169, tabulated per (Ref, Alt) pair
# (112 SNVs in total at this position)
snvs_ins_del_clean %>%
  dplyr::filter(PositionRef == 1169, Type == "SNV") %>%
  dplyr::group_by(Ref, Alt) %>%
  dplyr::count()
## # A tibble: 3 × 3
## # Groups:   Ref, Alt [3]
##   Ref   Alt       n
##   <chr> <chr> <int>
## 1 C     A        12
## 2 C     G         2
## 3 C     T        98

Data Summary

## Summary of data
# Each vtree() call below draws a variable tree of mutation counts, split by
# the columns named in `vars` (in the order given).
# Add imageFileOnly = TRUE to any call to output a PNG instead.

# Technology -> Type -> Consequence; nothing is drawn below the Insertion and
# Deletion nodes.
vtree(
  snvs_ins_del_clean,
  vars = "Technology Type Consequence",
  title = "Total mutations",
  prunebelow = list(Type = c("Insertion", "Deletion"))
)

# Exposure -> codon position (1, 2 or 3)
vtree(
  snvs_ins_del_clean,
  vars = " Exposure codon_position "
)

# Exposure -> Study
vtree(
  snvs_ins_del_clean,
  vars = "Exposure Study",
  title = "Total mutations"
)

# Exposure only
vtree(
  snvs_ins_del_clean,
  vars = "Exposure",
  title = "Total mutations"
)

# Exposure -> Technology
vtree(
  snvs_ins_del_clean,
  vars = "Exposure Technology",
  title = "Total mutations"
)

# Exposure -> Tissue
vtree(
  snvs_ins_del_clean,
  vars = "Exposure Tissue",
  title = "Total mutations"
)

# Consequence only, after dropping rows without a Consequence annotation
vtree(
  snvs_ins_del_clean %>% dplyr::filter(!is.na(Consequence)),
  vars = "Consequence",
  title = "Total mutations"
)

# Technology -> Consequence
vtree(
  snvs_ins_del_clean,
  vars = "Technology Consequence",
  title = "Total mutations"
)

# Consequence -> Technology
vtree(
  snvs_ins_del_clean,
  vars = "Consequence Technology",
  title = "Total mutations"
)

# One row per codon carrying a missense mutation. Technology is replaced by
# the comma-separated list of distinct technologies seen for that codon, so a
# codon found by both methods gets a combined label.
# The result stays grouped by Codon.
codon_composition_tech <- snvs_ins_del_clean %>%
  dplyr::filter(Consequence == "missense") %>%
  dplyr::select(Codon, Technology) %>%
  dplyr::group_by(Codon) %>%
  dplyr::distinct() %>%
  dplyr::mutate(Technology = paste(Technology, collapse = ", ")) %>%
  dplyr::distinct()

# Distinct (Exposure, Codon, Technology) combinations among missense
# mutations; the result stays grouped by Codon.
codon_composition_exp <- snvs_ins_del_clean %>%
  dplyr::filter(Consequence == "missense") %>%
  dplyr::select(Exposure, Codon, Technology) %>%
  dplyr::group_by(Codon) %>%
  dplyr::distinct()

# Add imageFileOnly = TRUE to any call below to output a PNG instead.

# Codons split by the technology (or combination of technologies) that
# reported them
vtree(
  codon_composition_tech,
  vars = "Technology",
  title = "Codons"
)

# Exposure / Technology combinations per codon, drawn as a pattern tree
vtree(
  codon_composition_exp,
  vars = "Exposure Technology",
  title = "Codons",
  pattern = TRUE
)

# All mutations: Type -> Technology
vtree(
  snvs_ins_del_clean,
  vars = "Type Technology",
  title = "Total mutations"
)

# Codon of every mutation whose Exposure is something other than "Control"
# (rows with a missing Exposure are dropped by filter()).
# The previous group_by(Codon) was removed: filter() here is a row-wise test,
# so grouping changed neither the rows kept nor their order.
exp_codons <- snvs_ins_del_clean %>%
  dplyr::filter(Exposure != "Control") %>%
  dplyr::pull(Codon)

# Same vector as exp_codons; the name is kept so existing references still
# work. It used to be recomputed with an identical, ungrouped pipeline.
exp_codons_ungrouped <- exp_codons

# Mutations at codons that never appear under a non-control exposure
controls_only <- snvs_ins_del_clean %>%
  dplyr::filter(!Codon %in% exp_codons)

# Codons present in controls_only, each listed once (sorted by codon number)
controls_only %>%
  dplyr::count(Codon) %>%
  dplyr::pull(Codon)
##  [1]   25   55   77  112  164  166  169  176  183  187  190  294  323  360  362
## [16]  366  379  395  407  440  448  492  515  535  625  648  651  653  673  684
## [31]  693  733  770  782  796  800  811  832  843  846  872  921  925 1008 1022
# Cross-check of the list above using every row rather than per-codon tallies:
# collect the codons of all control rows and of all non-control rows, then keep
# the control codons that never occur among the non-control ones.
control_only_aa_all <- snvs_ins_del_clean %>%
  dplyr::filter(Exposure == "Control") %>%
  dplyr::pull(Codon)
non_control_aa_all <- snvs_ins_del_clean %>%
  dplyr::filter(Exposure != "Control") %>%
  dplyr::pull(Codon)
# Unique control codons absent from the non-control set, in order of first
# appearance
base::setdiff(control_only_aa_all, non_control_aa_all)
##  [1]   55   77  323  379  440  535  673  832  625  684  693  843 1008  846  651
## [16]  770  921  112  166  782  190  360  169  800  811  362  872  648  796  448
## [31]  294  366  395  492  515  733  407  925  176  183  187  653  164 1022   25
# Tissue breakdown of the mutations in controls_only
vtree(
  controls_only,
  vars = "Tissue",
  title = "Control-only mutations"
)

# Number of mutations per control-only codon, most frequent first
controls_only_ranked <- controls_only %>%
  dplyr::count(Codon) %>%
  dplyr::arrange(dplyr::desc(n))

# Use the ranked order as the factor level order so that the bars are drawn
# from most to least mutated instead of by codon number.
controls_only_ranked$Codon <- factor(
  controls_only_ranked$Codon,
  levels = controls_only_ranked$Codon
)

# Bar chart of mutation counts per codon with vertical x-axis labels
ggplot(controls_only_ranked, aes(x = Codon, y = n, group = Codon)) +
  geom_col() +
  theme(axis.text.x = element_text(angle = 90))

# Export the combined mutation table and the control-only subset as
# tab-delimited files without quoting or row names.
# NOTE(review): both paths start with "../", whereas the other write.table()
# calls in this file write to "data/processed/" -- confirm that the
# parent-directory location is intended.
write.table(
  x = snvs_ins_del_clean,
  file = "../data/processed/summary.txt",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)
write.table(
  x = controls_only,
  file = "../data/processed/controls_only.txt",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)

# Add an Effect label to every mutation. Conditions are tried top to bottom
# and the first one that is TRUE wins:
#   1. any sign of an insertion/deletion (indicator columns, "INS"/"DEL"
#      placeholders, "del" anywhere in Alt, Ref and Alt of different length,
#      or a frameshift consequence)                          -> "Indel"
#   2. the annotated Consequence                             -> matching label
#   3. equal-length multi-base substitutions                 -> "Other"
#   4. anything left whose Alt does not mention "del"        -> "SNV"
# Rows that match no condition get NA.
breakdown_types <- snvs_ins_del_clean %>%
  dplyr::mutate(
    Effect = dplyr::case_when(
      (
        Deletion == 1 |
          Insertion == 1 |
          Ref == "INS" |
          Alt == "DEL" |
          str_detect(Alt, regex("del", ignore_case = TRUE)) |
          str_length(Ref) != str_length(Alt) |
          Consequence == "frameshift"
      ) ~ "Indel",
      Consequence == "nonsense" ~ "Nonsense",
      Consequence == "silent" ~ "Silent",
      Consequence == "missense" ~ "Missense",
      Consequence == "stop lost" ~ "Stop Lost",
      (
        Consequence == "complex" |
          (str_length(Ref) > 1 & str_length(Ref) == str_length(Alt))
      ) ~ "Other",
      !str_detect(Alt, regex("del", ignore_case = TRUE)) ~ "SNV"
    )
  )

# Add imageFileOnly = TRUE to any call below to write an image file instead.

# Effect -> Technology
vtree(
  breakdown_types,
  vars = "Effect Technology",
  title = "Total mutations"
)

# Effect categories only, shown under the column name "Total": the root node
# is hidden, arrowheads are removed and the edges are made invisible.
breakdown_types %>%
  dplyr::rename(Total = Effect) %>%
  vtree(
    vars = "Total",
    title = "Total mutations",
    showroot = FALSE,
    arrowhead = "none",
    edgeattr = "style=invis"
  )

# Type -> Consequence -> Technology, keeping only the four listed Consequence
# values and turning off valid percentages (vp).
vtree(
  breakdown_types,
  vars = "Type Consequence Technology",
  title = "Total mutations",
  keep = list(Consequence = c("missense", "nonsense", "silent", "stop lost")),
  vp = FALSE
)

Specific types of amino acid changes and their prevalence in different structural features

# Want to know whether we find different substitutions in secondary structure

# is_ss:     TRUE when the residue's DSSP class is anything other than "Coil".
# aa_change: substitutions that leave the amino acid unchanged are collapsed
#            into a single "None" level; all others keep their "Ref>Alt" label.
# Fixes two problems in the previous mutate() call: the ifelse() result was
# never assigned to aa_change (it only added a stray column named after the
# expression), and its yes/no branches were swapped, which would have
# labelled every real substitution "None".
# NOTE(review): the recorded model output has no "None" level and no
# same-residue labels, so structure_analysis appears to hold no synonymous
# rows and this is a no-op on the current data -- confirm.
structure_analysis_specific <- structure_analysis %>%
  dplyr::mutate(
    is_ss = DSSP != "Coil",
    aa_change = dplyr::case_when(
      `Ref A.A.` == `Alt A.A.` ~ "None",
      TRUE ~ as.character(aa_change)
    )
  )
structure_analysis_specific$aa_change <- factor(structure_analysis_specific$aa_change)

# Cross-tabulate substitution type (rows) against secondary-structure
# membership (columns), then test the two for independence.
# NOTE(review): with this many substitution levels many cells are likely to
# have small expected counts, so the chi-squared approximation may be poor --
# check for the corresponding warning.
changes_and_ss <- table(
  structure_analysis_specific[["aa_change"]],
  structure_analysis_specific[["is_ss"]]
)
chi_squared_result_ss <- chisq.test(x = changes_and_ss)
print(chi_squared_result_ss)
## 
##  Pearson's Chi-squared test
## 
## data:  changes_and_ss
## X-squared = 165.52, df = 132, p-value = 0.02556
# Logistic regression: does the type of amino acid substitution (aa_change)
# predict whether the residue lies in a secondary-structure element (is_ss)?
# NOTE(review): the recorded summary shows standard errors in the thousands
# for every term and 17 Fisher scoring iterations, which usually points to
# (quasi-)complete separation -- treat the p-values with caution.
model <- glm(
  formula = is_ss ~ aa_change,
  family = binomial,
  data = structure_analysis_specific
)

# Coefficient table and deviance statistics for the fit
summary(model)
## 
## Call:
## glm(formula = is_ss ~ aa_change, family = binomial, data = structure_analysis_specific)
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)
## (Intercept)   1.857e+01  2.663e+03   0.007    0.994
## aa_changeA>E -1.857e+01  2.663e+03  -0.007    0.994
## aa_changeA>G -1.828e+01  2.663e+03  -0.007    0.995
## aa_changeA>P -1.801e+01  2.663e+03  -0.007    0.995
## aa_changeA>S -1.759e+01  2.663e+03  -0.007    0.995
## aa_changeA>T -1.765e+01  2.663e+03  -0.007    0.995
## aa_changeA>V -1.696e+01  2.663e+03  -0.006    0.995
## aa_changeC>F  3.101e-07  4.612e+03   0.000    1.000
## aa_changeC>R -1.787e+01  2.663e+03  -0.007    0.995
## aa_changeC>S  3.131e-07  5.326e+03   0.000    1.000
## aa_changeC>W  3.098e-07  4.612e+03   0.000    1.000
## aa_changeC>Y  3.100e-07  4.612e+03   0.000    1.000
## aa_changeD>A -1.857e+01  2.663e+03  -0.007    0.994
## aa_changeD>E -1.706e+01  2.663e+03  -0.006    0.995
## aa_changeD>G  3.098e-07  4.210e+03   0.000    1.000
## aa_changeD>H -1.816e+01  2.663e+03  -0.007    0.995
## aa_changeD>N -1.754e+01  2.663e+03  -0.007    0.995
## aa_changeD>V -1.662e+01  2.663e+03  -0.006    0.995
## aa_changeD>Y -1.718e+01  2.663e+03  -0.006    0.995
## aa_changeE>A -1.787e+01  2.663e+03  -0.007    0.995
## aa_changeE>D -1.662e+01  2.663e+03  -0.006    0.995
## aa_changeE>G -1.662e+01  2.663e+03  -0.006    0.995
## aa_changeE>K -1.703e+01  2.663e+03  -0.006    0.995
## aa_changeE>Q -1.747e+01  2.663e+03  -0.007    0.995
## aa_changeE>V -1.731e+01  2.663e+03  -0.007    0.995
## aa_changeF>C -3.713e+01  7.045e+03  -0.005    0.996
## aa_changeF>I  3.132e-07  5.326e+03   0.000    1.000
## aa_changeF>L -1.857e+01  2.663e+03  -0.007    0.994
## aa_changeF>V -1.857e+01  2.663e+03  -0.007    0.994
## aa_changeF>Y  3.093e-07  7.045e+03   0.000    1.000
## aa_changeG>A -1.718e+01  2.663e+03  -0.006    0.995
## aa_changeG>C -1.727e+01  2.663e+03  -0.006    0.995
## aa_changeG>D -1.795e+01  2.663e+03  -0.007    0.995
## aa_changeG>E -1.696e+01  2.663e+03  -0.006    0.995
## aa_changeG>R -1.762e+01  2.663e+03  -0.007    0.995
## aa_changeG>S -1.718e+01  2.663e+03  -0.006    0.995
## aa_changeG>V -1.747e+01  2.663e+03  -0.007    0.995
## aa_changeG>W -1.787e+01  2.663e+03  -0.007    0.995
## aa_changeH>D -1.787e+01  2.663e+03  -0.007    0.995
## aa_changeH>L -1.787e+01  2.663e+03  -0.007    0.995
## aa_changeH>N -1.787e+01  2.663e+03  -0.007    0.995
## aa_changeH>P -1.765e+01  2.663e+03  -0.007    0.995
## aa_changeH>Q -1.806e+01  2.663e+03  -0.007    0.995
## aa_changeH>R -1.885e+01  2.663e+03  -0.007    0.994
## aa_changeH>Y -1.897e+01  2.663e+03  -0.007    0.994
## aa_changeI>F -1.677e+01  2.663e+03  -0.006    0.995
## aa_changeI>M -3.713e+01  7.045e+03  -0.005    0.996
## aa_changeI>N  3.097e-07  3.523e+03   0.000    1.000
## aa_changeI>T  3.093e-07  7.045e+03   0.000    1.000
## aa_changeI>V -1.857e+01  2.663e+03  -0.007    0.994
## aa_changeK>E -1.718e+01  2.663e+03  -0.006    0.995
## aa_changeK>N  3.096e-07  4.612e+03   0.000    1.000
## aa_changeK>Q  3.093e-07  5.326e+03   0.000    1.000
## aa_changeL>F -1.857e+01  2.663e+03  -0.007    0.994
## aa_changeL>H -1.857e+01  2.663e+03  -0.007    0.994
## aa_changeL>I -1.787e+01  2.663e+03  -0.007    0.995
## aa_changeL>M -1.857e+01  2.663e+03  -0.007    0.994
## aa_changeL>P -1.703e+01  2.663e+03  -0.006    0.995
## aa_changeL>Q -1.787e+01  2.663e+03  -0.007    0.995
## aa_changeL>R -1.718e+01  2.663e+03  -0.006    0.995
## aa_changeL>V -1.787e+01  2.663e+03  -0.007    0.995
## aa_changeM>I -1.816e+01  2.663e+03  -0.007    0.995
## aa_changeM>K -1.747e+01  2.663e+03  -0.007    0.995
## aa_changeM>L -1.787e+01  2.663e+03  -0.007    0.995
## aa_changeM>R -1.816e+01  2.663e+03  -0.007    0.995
## aa_changeM>T  3.135e-07  7.045e+03   0.000    1.000
## aa_changeM>V  3.093e-07  5.326e+03   0.000    1.000
## aa_changeN>D  3.139e-07  4.210e+03   0.000    1.000
## aa_changeN>H -1.857e+01  2.663e+03  -0.007    0.994
## aa_changeN>I -1.772e+01  2.663e+03  -0.007    0.995
## aa_changeN>K -1.765e+01  2.663e+03  -0.007    0.995
## aa_changeN>S  3.093e-07  4.612e+03   0.000    1.000
## aa_changeN>T -1.926e+01  2.663e+03  -0.007    0.994
## aa_changeN>Y  3.135e-07  3.766e+03   0.000    1.000
## aa_changeP>A -1.857e+01  2.663e+03  -0.007    0.994
## aa_changeP>H -1.926e+01  2.663e+03  -0.007    0.994
## aa_changeP>L -1.841e+01  2.663e+03  -0.007    0.994
## aa_changeP>Q -1.977e+01  2.663e+03  -0.007    0.994
## aa_changeP>R -1.948e+01  2.663e+03  -0.007    0.994
## aa_changeP>S -1.828e+01  2.663e+03  -0.007    0.995
## aa_changeP>T -1.838e+01  2.663e+03  -0.007    0.994
## aa_changeQ>E  3.130e-07  4.210e+03   0.000    1.000
## aa_changeQ>H -1.696e+01  2.663e+03  -0.006    0.995
## aa_changeQ>K -1.857e+01  2.663e+03  -0.007    0.994
## aa_changeQ>L -3.713e+01  4.612e+03  -0.008    0.994
## aa_changeQ>P -1.718e+01  2.663e+03  -0.006    0.995
## aa_changeQ>R -1.718e+01  2.663e+03  -0.006    0.995
## aa_changeR>C -1.718e+01  2.663e+03  -0.006    0.995
## aa_changeR>G -1.747e+01  2.663e+03  -0.007    0.995
## aa_changeR>H -1.696e+01  2.663e+03  -0.006    0.995
## aa_changeR>L -1.739e+01  2.663e+03  -0.007    0.995
## aa_changeR>P -1.696e+01  2.663e+03  -0.006    0.995
## aa_changeR>Q  3.132e-07  5.326e+03   0.000    1.000
## aa_changeR>S -1.823e+01  2.663e+03  -0.007    0.995
## aa_changeR>W  3.137e-07  7.045e+03   0.000    1.000
## aa_changeS>C -1.816e+01  2.663e+03  -0.007    0.995
## aa_changeS>F  3.094e-07  7.045e+03   0.000    1.000
## aa_changeS>G -1.926e+01  2.663e+03  -0.007    0.994
## aa_changeS>I -1.926e+01  2.663e+03  -0.007    0.994
## aa_changeS>L -1.696e+01  2.663e+03  -0.006    0.995
## aa_changeS>N -1.857e+01  2.663e+03  -0.007    0.994
## aa_changeS>P -1.857e+01  2.663e+03  -0.007    0.994
## aa_changeS>R -1.875e+01  2.663e+03  -0.007    0.994
## aa_changeS>T -3.713e+01  7.045e+03  -0.005    0.996
## aa_changeS>W  3.093e-07  4.612e+03   0.000    1.000
## aa_changeS>Y  3.139e-07  7.045e+03   0.000    1.000
## aa_changeT>A -1.747e+01  2.663e+03  -0.007    0.995
## aa_changeT>I  3.140e-07  5.326e+03   0.000    1.000
## aa_changeT>K  3.138e-07  4.210e+03   0.000    1.000
## aa_changeT>M  3.138e-07  4.612e+03   0.000    1.000
## aa_changeT>N -3.713e+01  7.045e+03  -0.005    0.996
## aa_changeT>P -1.828e+01  2.663e+03  -0.007    0.995
## aa_changeT>R  3.132e-07  4.612e+03   0.000    1.000
## aa_changeT>S -1.747e+01  2.663e+03  -0.007    0.995
## aa_changeV>A  3.100e-07  3.950e+03   0.000    1.000
## aa_changeV>D  3.094e-07  3.629e+03   0.000    1.000
## aa_changeV>E  3.093e-07  4.612e+03   0.000    1.000
## aa_changeV>F  3.140e-07  3.766e+03   0.000    1.000
## aa_changeV>G  3.094e-07  5.326e+03   0.000    1.000
## aa_changeV>I -1.718e+01  2.663e+03  -0.006    0.995
## aa_changeV>L  3.100e-07  3.523e+03   0.000    1.000
## aa_changeV>M  3.140e-07  4.210e+03   0.000    1.000
## aa_changeW>C  3.135e-07  3.368e+03   0.000    1.000
## aa_changeW>G  3.101e-07  4.612e+03   0.000    1.000
## aa_changeW>L  3.094e-07  3.629e+03   0.000    1.000
## aa_changeW>R  3.131e-07  3.219e+03   0.000    1.000
## aa_changeW>S  3.094e-07  5.326e+03   0.000    1.000
## aa_changeY>C -1.765e+01  2.663e+03  -0.007    0.995
## aa_changeY>D  3.132e-07  4.612e+03   0.000    1.000
## aa_changeY>F  3.094e-07  4.612e+03   0.000    1.000
## aa_changeY>H  3.139e-07  5.326e+03   0.000    1.000
## aa_changeY>N  3.093e-07  4.210e+03   0.000    1.000
## aa_changeY>S -1.857e+01  2.663e+03  -0.007    0.994
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 993.93  on 861  degrees of freedom
## Residual deviance: 798.20  on 729  degrees of freedom
## AIC: 1064.2
## 
## Number of Fisher Scoring iterations: 17
# Diagnostic plots for the fitted glm
plot(model)

# Coefficient table as a data frame. The component is spelled out in full
# (`coefficients`) instead of relying on `$` partial matching of
# `coefficient`.
results_specific <- as.data.frame(summary(model)$coefficients)
# Exponentiated estimates, i.e. odds ratios relative to the reference level
results_specific$Odds_ratio <- exp(results_specific[["Estimate"]])
# Terms with p < 0.05, ordered from smallest to largest odds ratio
results_specific_sig <- results_specific %>%
  dplyr::filter(`Pr(>|z|)` < 0.05) %>%
  dplyr::arrange(Odds_ratio)

# Fit regression model
# Is the reference AA important for the type of SS?
# Overwrites `model` with a binomial GLM of DSSP on aa_change plus the
# DSSP:aa_change interaction.
# NOTE(review): DSSP is both the response and part of the interaction term,
# so the outcome is used to predict itself. The recorded summary reports 637
# coefficients "not defined because of singularities" and a p-value of 1 for
# every estimated term.
# NOTE(review): the coefficient names suggest DSSP is a factor with more than
# two levels; with family = binomial a factor response is treated as first
# level vs. all other levels, which may not be what is wanted here.
# NOTE(review): the question above is about the reference amino acid, but the
# predictor is aa_change (Ref>Alt). Confirm the intended formula before
# interpreting these results.
model <- glm(DSSP ~ aa_change + DSSP:aa_change, data = structure_analysis_specific, family = binomial)

# Get summary of the model
summary(model)
## 
## Call:
## glm(formula = DSSP ~ aa_change + DSSP:aa_change, family = binomial, 
##     data = structure_analysis_specific)
## 
## Coefficients: (637 not defined because of singularities)
##                              Estimate Std. Error z value Pr(>|z|)
## (Intercept)                 2.657e+01  2.518e+05       0        1
## aa_changeA>E               -5.313e+01  2.980e+05       0        1
## aa_changeA>G               -5.313e+01  3.251e+05       0        1
## aa_changeA>P               -5.313e+01  3.084e+05       0        1
## aa_changeA>S               -5.313e+01  3.251e+05       0        1
## aa_changeA>T               -5.313e+01  3.561e+05       0        1
## aa_changeA>V               -5.313e+01  4.362e+05       0        1
## aa_changeC>F                4.418e-06  4.362e+05       0        1
## aa_changeC>R               -5.313e+01  3.561e+05       0        1
## aa_changeC>S                4.622e-06  3.561e+05       0        1
## aa_changeC>W               -1.915e-08  3.561e+05       0        1
## aa_changeC>Y               -1.240e-08  4.362e+05       0        1
## aa_changeD>A               -5.313e+01  3.251e+05       0        1
## aa_changeD>E               -5.313e+01  3.561e+05       0        1
## aa_changeD>G               -1.909e-07  4.362e+05       0        1
## aa_changeD>H               -5.313e+01  3.561e+05       0        1
## aa_changeD>N               -5.313e+01  2.980e+05       0        1
## aa_changeD>V               -5.313e+01  4.362e+05       0        1
## aa_changeD>Y               -5.313e+01  3.251e+05       0        1
## aa_changeE>A               -5.313e+01  4.362e+05       0        1
## aa_changeE>D               -5.313e+01  4.362e+05       0        1
## aa_changeE>G               -5.313e+01  4.362e+05       0        1
## aa_changeE>K               -5.313e+01  3.251e+05       0        1
## aa_changeE>Q               -5.313e+01  4.362e+05       0        1
## aa_changeE>V               -5.313e+01  3.561e+05       0        1
## aa_changeF>C               -5.313e+01  4.362e+05       0        1
## aa_changeF>I                4.633e-06  4.362e+05       0        1
## aa_changeF>L               -5.313e+01  3.251e+05       0        1
## aa_changeF>V               -5.313e+01  4.362e+05       0        1
## aa_changeF>Y                4.429e-06  4.362e+05       0        1
## aa_changeG>A               -5.313e+01  4.362e+05       0        1
## aa_changeG>C               -5.313e+01  3.251e+05       0        1
## aa_changeG>D               -5.313e+01  2.855e+05       0        1
## aa_changeG>E               -5.313e+01  4.362e+05       0        1
## aa_changeG>R               -5.313e+01  2.855e+05       0        1
## aa_changeG>S               -5.313e+01  3.084e+05       0        1
## aa_changeG>V               -5.313e+01  2.815e+05       0        1
## aa_changeG>W               -5.313e+01  4.362e+05       0        1
## aa_changeH>D               -5.313e+01  4.362e+05       0        1
## aa_changeH>L               -5.313e+01  3.561e+05       0        1
## aa_changeH>N               -5.313e+01  4.362e+05       0        1
## aa_changeH>P               -5.313e+01  3.561e+05       0        1
## aa_changeH>Q               -5.313e+01  3.251e+05       0        1
## aa_changeH>R               -5.313e+01  3.084e+05       0        1
## aa_changeH>Y               -5.313e+01  3.251e+05       0        1
## aa_changeI>F               -5.313e+01  4.362e+05       0        1
## aa_changeI>M               -5.313e+01  4.362e+05       0        1
## aa_changeI>N                4.625e-06  2.908e+05       0        1
## aa_changeI>T                4.639e-06  4.362e+05       0        1
## aa_changeI>V               -5.313e+01  3.561e+05       0        1
## aa_changeK>E               -5.313e+01  4.362e+05       0        1
## aa_changeK>N               -1.867e-07  4.362e+05       0        1
## aa_changeK>Q                4.631e-06  4.362e+05       0        1
## aa_changeL>F               -5.313e+01  4.362e+05       0        1
## aa_changeL>H               -5.313e+01  4.362e+05       0        1
## aa_changeL>I               -5.313e+01  4.362e+05       0        1
## aa_changeL>M               -5.313e+01  3.251e+05       0        1
## aa_changeL>P               -5.313e+01  3.251e+05       0        1
## aa_changeL>Q               -5.313e+01  3.561e+05       0        1
## aa_changeL>R               -5.313e+01  3.561e+05       0        1
## aa_changeL>V               -5.313e+01  4.362e+05       0        1
## aa_changeM>I               -5.313e+01  3.561e+05       0        1
## aa_changeM>K               -5.313e+01  4.362e+05       0        1
## aa_changeM>L               -5.313e+01  4.362e+05       0        1
## aa_changeM>R               -5.313e+01  3.561e+05       0        1
## aa_changeM>T                4.414e-06  4.362e+05       0        1
## aa_changeM>V               -1.792e-07  4.362e+05       0        1
## aa_changeN>D                4.429e-06  4.362e+05       0        1
## aa_changeN>H               -5.313e+01  4.362e+05       0        1
## aa_changeN>I               -5.313e+01  3.251e+05       0        1
## aa_changeN>K               -5.313e+01  3.084e+05       0        1
## aa_changeN>S                4.437e-06  3.561e+05       0        1
## aa_changeN>T               -5.313e+01  3.561e+05       0        1
## aa_changeN>Y               -1.292e-08  4.362e+05       0        1
## aa_changeP>A               -5.313e+01  3.561e+05       0        1
## aa_changeP>H               -5.313e+01  3.561e+05       0        1
## aa_changeP>L               -5.313e+01  2.908e+05       0        1
## aa_changeP>Q               -5.313e+01  2.759e+05       0        1
## aa_changeP>R               -5.313e+01  2.980e+05       0        1
## aa_changeP>S               -5.313e+01  3.251e+05       0        1
## aa_changeP>T               -5.313e+01  2.980e+05       0        1
## aa_changeQ>E                4.456e-06  3.561e+05       0        1
## aa_changeQ>H               -5.313e+01  3.561e+05       0        1
## aa_changeQ>K               -5.313e+01  2.980e+05       0        1
## aa_changeQ>L               -5.313e+01  3.251e+05       0        1
## aa_changeQ>P               -5.313e+01  4.362e+05       0        1
## aa_changeQ>R               -5.313e+01  4.362e+05       0        1
## aa_changeR>C               -5.313e+01  3.251e+05       0        1
## aa_changeR>G               -5.313e+01  3.561e+05       0        1
## aa_changeR>H               -5.313e+01  3.561e+05       0        1
## aa_changeR>L               -5.313e+01  3.084e+05       0        1
## aa_changeR>P               -5.313e+01  3.251e+05       0        1
## aa_changeR>Q                4.642e-06  3.561e+05       0        1
## aa_changeR>S               -5.313e+01  2.980e+05       0        1
## aa_changeR>W               -1.403e-08  4.362e+05       0        1
## aa_changeS>C               -5.313e+01  3.561e+05       0        1
## aa_changeS>F                4.438e-06  4.362e+05       0        1
## aa_changeS>G               -5.313e+01  3.561e+05       0        1
## aa_changeS>I               -5.313e+01  3.561e+05       0        1
## aa_changeS>L               -5.313e+01  4.362e+05       0        1
## aa_changeS>N               -5.313e+01  3.561e+05       0        1
## aa_changeS>P               -5.313e+01  3.084e+05       0        1
## aa_changeS>R               -5.313e+01  2.908e+05       0        1
## aa_changeS>T               -5.313e+01  4.362e+05       0        1
## aa_changeS>W                4.415e-06  4.362e+05       0        1
## aa_changeS>Y                4.423e-06  4.362e+05       0        1
## aa_changeT>A               -5.313e+01  4.362e+05       0        1
## aa_changeT>I               -4.102e-09  4.362e+05       0        1
## aa_changeT>K               -2.201e-07  3.251e+05       0        1
## aa_changeT>M                4.418e-06  4.362e+05       0        1
## aa_changeT>N               -5.313e+01  4.362e+05       0        1
## aa_changeT>P               -5.313e+01  3.251e+05       0        1
## aa_changeT>R                4.629e-06  3.251e+05       0        1
## aa_changeT>S               -5.313e+01  4.362e+05       0        1
## aa_changeV>A                4.625e-06  2.980e+05       0        1
## aa_changeV>D               -1.797e-10  2.855e+05       0        1
## aa_changeV>E               -1.991e-07  3.251e+05       0        1
## aa_changeV>F                4.423e-06  2.980e+05       0        1
## aa_changeV>G               -1.834e-07  4.362e+05       0        1
## aa_changeV>I               -5.313e+01  4.362e+05       0        1
## aa_changeV>L                4.415e-06  2.980e+05       0        1
## aa_changeV>M                3.387e-10  3.561e+05       0        1
## aa_changeW>C                4.434e-06  3.251e+05       0        1
## aa_changeW>G                2.176e-09  4.362e+05       0        1
## aa_changeW>L                1.066e-08  4.362e+05       0        1
## aa_changeW>R                4.417e-06  4.362e+05       0        1
## aa_changeW>S                4.422e-06  4.362e+05       0        1
## aa_changeY>C               -5.313e+01  3.561e+05       0        1
## aa_changeY>D                2.042e-09  3.561e+05       0        1
## aa_changeY>F                4.439e-06  3.561e+05       0        1
## aa_changeY>H                7.669e-09  4.362e+05       0        1
## aa_changeY>N                9.123e-09  4.362e+05       0        1
## aa_changeY>S               -5.313e+01  4.362e+05       0        1
## DSSPα-helix:aa_changeA>D    4.415e-06  4.362e+05       0        1
## DSSPBend:aa_changeA>D              NA         NA      NA       NA
## DSSPβ-bridge:aa_changeA>D          NA         NA      NA       NA
## DSSP310 Helix:aa_changeA>D         NA         NA      NA       NA
## DSSPÏ€-helix:aa_changeA>D           NA         NA      NA       NA
## DSSPβ-strand:aa_changeA>D   4.625e-06  3.251e+05       0        1
## DSSPTurn:aa_changeA>D              NA         NA      NA       NA
## DSSPα-helix:aa_changeA>E           NA         NA      NA       NA
## DSSPBend:aa_changeA>E       5.313e+01  2.980e+05       0        1
## DSSPβ-bridge:aa_changeA>E          NA         NA      NA       NA
## DSSP310 Helix:aa_changeA>E         NA         NA      NA       NA
## DSSPÏ€-helix:aa_changeA>E           NA         NA      NA       NA
## DSSPβ-strand:aa_changeA>E   5.313e+01  2.601e+05       0        1
## DSSPTurn:aa_changeA>E              NA         NA      NA       NA
## DSSPα-helix:aa_changeA>G           NA         NA      NA       NA
## DSSPBend:aa_changeA>G       5.313e+01  4.112e+05       0        1
## DSSPβ-bridge:aa_changeA>G          NA         NA      NA       NA
## DSSP310 Helix:aa_changeA>G         NA         NA      NA       NA
## DSSPπ-helix:aa_changeA>G           NA         NA      NA       NA
## DSSPβ-strand:aa_changeA>G   5.313e+01  3.251e+05       0        1
## DSSPTurn:aa_changeA>G       5.313e+01  4.112e+05       0        1
## DSSPα-helix:aa_changeA>P           NA         NA      NA       NA
## DSSPBend:aa_changeA>P       5.313e+01  2.720e+05       0        1
## DSSPβ-bridge:aa_changeA>P          NA         NA      NA       NA
## DSSP310 Helix:aa_changeA>P         NA         NA      NA       NA
## DSSPπ-helix:aa_changeA>P           NA         NA      NA       NA
## DSSPβ-strand:aa_changeA>P   5.313e+01  2.518e+05       0        1
## DSSPTurn:aa_changeA>P              NA         NA      NA       NA
## DSSPα-helix:aa_changeA>S           NA         NA      NA       NA
## DSSPBend:aa_changeA>S       5.313e+01  2.908e+05       0        1
## DSSPβ-bridge:aa_changeA>S          NA         NA      NA       NA
## DSSP310 Helix:aa_changeA>S         NA         NA      NA       NA
## DSSPπ-helix:aa_changeA>S           NA         NA      NA       NA
## DSSPβ-strand:aa_changeA>S   5.313e+01  2.720e+05       0        1
## DSSPTurn:aa_changeA>S       5.313e+01  4.112e+05       0        1
## DSSPα-helix:aa_changeA>T    5.313e+01  4.362e+05       0        1
## DSSPBend:aa_changeA>T              NA         NA      NA       NA
## DSSPβ-bridge:aa_changeA>T          NA         NA      NA       NA
## DSSP310 Helix:aa_changeA>T         NA         NA      NA       NA
## DSSPπ-helix:aa_changeA>T           NA         NA      NA       NA
## DSSPβ-strand:aa_changeA>T   5.313e+01  3.084e+05       0        1
## DSSPTurn:aa_changeA>T              NA         NA      NA       NA
## DSSPα-helix:aa_changeA>V    5.313e+01  4.362e+05       0        1
## DSSPBend:aa_changeA>V       5.313e+01  5.036e+05       0        1
## DSSPβ-bridge:aa_changeA>V          NA         NA      NA       NA
## DSSP310 Helix:aa_changeA>V  5.313e+01  5.036e+05       0        1
## DSSPπ-helix:aa_changeA>V           NA         NA      NA       NA
## DSSPβ-strand:aa_changeA>V   5.313e+01  5.036e+05       0        1
## DSSPTurn:aa_changeA>V              NA         NA      NA       NA
## DSSPα-helix:aa_changeC>F           NA         NA      NA       NA
## DSSPBend:aa_changeC>F              NA         NA      NA       NA
## DSSPβ-bridge:aa_changeC>F          NA         NA      NA       NA
## DSSP310 Helix:aa_changeC>F  1.670e-08  5.036e+05       0        1
## DSSPπ-helix:aa_changeC>F    2.421e-08  5.036e+05       0        1
## DSSPβ-strand:aa_changeC>F          NA         NA      NA       NA
## DSSPTurn:aa_changeC>F              NA         NA      NA       NA
## DSSPα-helix:aa_changeC>R           NA         NA      NA       NA
## DSSPBend:aa_changeC>R              NA         NA      NA       NA
## DSSPβ-bridge:aa_changeC>R          NA         NA      NA       NA
## DSSP310 Helix:aa_changeC>R  5.313e+01  4.362e+05       0        1
## DSSPπ-helix:aa_changeC>R    5.313e+01  4.362e+05       0        1
## DSSPβ-strand:aa_changeC>R   5.313e+01  3.561e+05       0        1
## DSSPTurn:aa_changeC>R              NA         NA      NA       NA
## DSSPα-helix:aa_changeC>S           NA         NA      NA       NA
## DSSPBend:aa_changeC>S              NA         NA      NA       NA
## DSSPβ-bridge:aa_changeC>S          NA         NA      NA       NA
## DSSP310 Helix:aa_changeC>S         NA         NA      NA       NA
## DSSPπ-helix:aa_changeC>S           NA         NA      NA       NA
## DSSPβ-strand:aa_changeC>S          NA         NA      NA       NA
## DSSPTurn:aa_changeC>S              NA         NA      NA       NA
## DSSPα-helix:aa_changeC>W           NA         NA      NA       NA
## DSSPBend:aa_changeC>W              NA         NA      NA       NA
## DSSPβ-bridge:aa_changeC>W          NA         NA      NA       NA
## DSSP310 Helix:aa_changeC>W         NA         NA      NA       NA
## DSSPπ-helix:aa_changeC>W    4.439e-06  4.362e+05       0        1
## DSSPβ-strand:aa_changeC>W          NA         NA      NA       NA
## DSSPTurn:aa_changeC>W              NA         NA      NA       NA
## DSSPα-helix:aa_changeC>Y           NA         NA      NA       NA
## DSSPBend:aa_changeC>Y              NA         NA      NA       NA
## DSSPβ-bridge:aa_changeC>Y          NA         NA      NA       NA
## DSSP310 Helix:aa_changeC>Y -2.153e-07  5.036e+05       0        1
## DSSPπ-helix:aa_changeC>Y    4.643e-06  5.036e+05       0        1
## DSSPβ-strand:aa_changeC>Y          NA         NA      NA       NA
## DSSPTurn:aa_changeC>Y              NA         NA      NA       NA
## DSSPα-helix:aa_changeD>A    5.313e+01  4.112e+05       0        1
## DSSPBend:aa_changeD>A              NA         NA      NA       NA
## DSSPβ-bridge:aa_changeD>A          NA         NA      NA       NA
## DSSP310 Helix:aa_changeD>A         NA         NA      NA       NA
## DSSPπ-helix:aa_changeD>A           NA         NA      NA       NA
## DSSPβ-strand:aa_changeD>A          NA         NA      NA       NA
## DSSPTurn:aa_changeD>A       5.313e+01  3.251e+05       0        1
## DSSPα-helix:aa_changeD>E    5.313e+01  4.362e+05       0        1
## DSSPBend:aa_changeD>E       5.313e+01  3.561e+05       0        1
## DSSPβ-bridge:aa_changeD>E          NA         NA      NA       NA
## DSSP310 Helix:aa_changeD>E  5.313e+01  4.362e+05       0        1
## DSSPπ-helix:aa_changeD>E           NA         NA      NA       NA
## DSSPβ-strand:aa_changeD>E   5.313e+01  3.561e+05       0        1
## DSSPTurn:aa_changeD>E       5.313e+01  3.251e+05       0        1
## DSSPα-helix:aa_changeD>G   -2.880e-08  5.036e+05       0        1
## DSSPBend:aa_changeD>G      -3.123e-08  4.362e+05       0        1
## DSSPβ-bridge:aa_changeD>G          NA         NA      NA       NA
## DSSP310 Helix:aa_changeD>G         NA         NA      NA       NA
## DSSPπ-helix:aa_changeD>G           NA         NA      NA       NA
## DSSPβ-strand:aa_changeD>G          NA         NA      NA       NA
## DSSPTurn:aa_changeD>G              NA         NA      NA       NA
## DSSPα-helix:aa_changeD>H           NA         NA      NA       NA
## DSSPBend:aa_changeD>H       5.313e+01  4.362e+05       0        1
## DSSPβ-bridge:aa_changeD>H          NA         NA      NA       NA
## DSSP310 Helix:aa_changeD>H         NA         NA      NA       NA
## DSSPπ-helix:aa_changeD>H    5.313e+01  4.362e+05       0        1
## DSSPβ-strand:aa_changeD>H          NA         NA      NA       NA
## DSSPTurn:aa_changeD>H       5.313e+01  4.362e+05       0        1
## DSSPα-helix:aa_changeD>N    5.313e+01  2.980e+05       0        1
## DSSPBend:aa_changeD>N       5.313e+01  2.252e+05       0        1
## DSSPβ-bridge:aa_changeD>N          NA         NA      NA       NA
## DSSP310 Helix:aa_changeD>N  5.313e+01  3.901e+05       0        1
## DSSPπ-helix:aa_changeD>N    5.313e+01  3.901e+05       0        1
## DSSPβ-strand:aa_changeD>N   5.313e+01  2.601e+05       0        1
## DSSPTurn:aa_changeD>N       5.313e+01  2.980e+05       0        1
## DSSPα-helix:aa_changeD>V    5.313e+01  4.362e+05       0        1
## DSSPBend:aa_changeD>V       5.313e+01  4.362e+05       0        1
## DSSPβ-bridge:aa_changeD>V          NA         NA      NA       NA
## DSSP310 Helix:aa_changeD>V         NA         NA      NA       NA
## DSSPπ-helix:aa_changeD>V           NA         NA      NA       NA
## DSSPβ-strand:aa_changeD>V   5.313e+01  5.036e+05       0        1
## DSSPTurn:aa_changeD>V       5.313e+01  4.362e+05       0        1
## DSSPα-helix:aa_changeD>Y    5.313e+01  2.908e+05       0        1
## DSSPBend:aa_changeD>Y       5.313e+01  3.251e+05       0        1
## DSSPβ-bridge:aa_changeD>Y   5.313e+01  4.112e+05       0        1
## DSSP310 Helix:aa_changeD>Y  5.313e+01  4.112e+05       0        1
## DSSPπ-helix:aa_changeD>Y    5.313e+01  4.112e+05       0        1
## DSSPβ-strand:aa_changeD>Y   5.313e+01  2.908e+05       0        1
## DSSPTurn:aa_changeD>Y       5.313e+01  4.112e+05       0        1
## DSSPα-helix:aa_changeE>A           NA         NA      NA       NA
## DSSPBend:aa_changeE>A              NA         NA      NA       NA
## DSSPβ-bridge:aa_changeE>A   5.313e+01  5.036e+05       0        1
## DSSP310 Helix:aa_changeE>A         NA         NA      NA       NA
## DSSPπ-helix:aa_changeE>A           NA         NA      NA       NA
## DSSPβ-strand:aa_changeE>A   5.313e+01  5.036e+05       0        1
## DSSPTurn:aa_changeE>A              NA         NA      NA       NA
## DSSPα-helix:aa_changeE>D    5.313e+01  5.036e+05       0        1
## DSSPBend:aa_changeE>D       5.313e+01  5.036e+05       0        1
## DSSPβ-bridge:aa_changeE>D   5.313e+01  5.036e+05       0        1
## DSSP310 Helix:aa_changeE>D  5.313e+01  5.036e+05       0        1
## DSSPπ-helix:aa_changeE>D           NA         NA      NA       NA
## DSSPβ-strand:aa_changeE>D   5.313e+01  4.112e+05       0        1
## DSSPTurn:aa_changeE>D              NA         NA      NA       NA
## DSSPα-helix:aa_changeE>G           NA         NA      NA       NA
## DSSPBend:aa_changeE>G       5.313e+01  5.036e+05       0        1
## DSSPβ-bridge:aa_changeE>G   5.313e+01  4.362e+05       0        1
## DSSP310 Helix:aa_changeE>G         NA         NA      NA       NA
## DSSPπ-helix:aa_changeE>G           NA         NA      NA       NA
## DSSPβ-strand:aa_changeE>G   5.313e+01  4.112e+05       0        1
## DSSPTurn:aa_changeE>G       5.313e+01  5.036e+05       0        1
## DSSPα-helix:aa_changeE>K           NA         NA      NA       NA
## DSSPBend:aa_changeE>K       5.313e+01  2.908e+05       0        1
## DSSPβ-bridge:aa_changeE>K   5.313e+01  3.251e+05       0        1
## DSSP310 Helix:aa_changeE>K  5.313e+01  4.112e+05       0        1
## DSSPπ-helix:aa_changeE>K           NA         NA      NA       NA
## DSSPβ-strand:aa_changeE>K   5.313e+01  2.457e+05       0        1
## DSSPTurn:aa_changeE>K       5.313e+01  4.112e+05       0        1
## DSSPα-helix:aa_changeE>Q           NA         NA      NA       NA
## DSSPBend:aa_changeE>Q       5.313e+01  5.036e+05       0        1
## DSSPβ-bridge:aa_changeE>Q          NA         NA      NA       NA
## DSSP310 Helix:aa_changeE>Q         NA         NA      NA       NA
## DSSPπ-helix:aa_changeE>Q           NA         NA      NA       NA
## DSSPβ-strand:aa_changeE>Q   5.313e+01  4.362e+05       0        1
## DSSPTurn:aa_changeE>Q              NA         NA      NA       NA
## DSSPα-helix:aa_changeE>V           NA         NA      NA       NA
## DSSPBend:aa_changeE>V       5.313e+01  3.561e+05       0        1
## DSSPβ-bridge:aa_changeE>V   5.313e+01  3.561e+05       0        1
## DSSP310 Helix:aa_changeE>V         NA         NA      NA       NA
## DSSPπ-helix:aa_changeE>V           NA         NA      NA       NA
## DSSPβ-strand:aa_changeE>V   5.313e+01  3.561e+05       0        1
## DSSPTurn:aa_changeE>V       5.313e+01  4.362e+05       0        1
## DSSPα-helix:aa_changeF>C           NA         NA      NA       NA
## DSSPBend:aa_changeF>C              NA         NA      NA       NA
## DSSPβ-bridge:aa_changeF>C          NA         NA      NA       NA
## DSSP310 Helix:aa_changeF>C         NA         NA      NA       NA
## DSSPπ-helix:aa_changeF>C           NA         NA      NA       NA
## DSSPβ-strand:aa_changeF>C          NA         NA      NA       NA
## DSSPTurn:aa_changeF>C              NA         NA      NA       NA
## DSSPα-helix:aa_changeF>I           NA         NA      NA       NA
## DSSPBend:aa_changeF>I      -2.035e-07  5.036e+05       0        1
## DSSPβ-bridge:aa_changeF>I          NA         NA      NA       NA
## DSSP310 Helix:aa_changeF>I         NA         NA      NA       NA
## DSSPπ-helix:aa_changeF>I           NA         NA      NA       NA
## DSSPβ-strand:aa_changeF>I          NA         NA      NA       NA
## DSSPTurn:aa_changeF>I              NA         NA      NA       NA
## DSSPα-helix:aa_changeF>L    5.313e+01  4.112e+05       0        1
## DSSPBend:aa_changeF>L       5.313e+01  4.112e+05       0        1
## DSSPβ-bridge:aa_changeF>L          NA         NA      NA       NA
## DSSP310 Helix:aa_changeF>L         NA         NA      NA       NA
## DSSPπ-helix:aa_changeF>L           NA         NA      NA       NA
## DSSPβ-strand:aa_changeF>L          NA         NA      NA       NA
## DSSPTurn:aa_changeF>L       5.313e+01  4.112e+05       0        1
## DSSPα-helix:aa_changeF>V           NA         NA      NA       NA
## DSSPBend:aa_changeF>V              NA         NA      NA       NA
## DSSPβ-bridge:aa_changeF>V          NA         NA      NA       NA
## DSSP310 Helix:aa_changeF>V         NA         NA      NA       NA
## DSSPπ-helix:aa_changeF>V           NA         NA      NA       NA
## DSSPβ-strand:aa_changeF>V   5.313e+01  5.036e+05       0        1
## DSSPTurn:aa_changeF>V              NA         NA      NA       NA
## DSSPα-helix:aa_changeF>Y           NA         NA      NA       NA
## DSSPBend:aa_changeF>Y              NA         NA      NA       NA
## DSSPβ-bridge:aa_changeF>Y          NA         NA      NA       NA
## DSSP310 Helix:aa_changeF>Y         NA         NA      NA       NA
## DSSPπ-helix:aa_changeF>Y           NA         NA      NA       NA
## DSSPβ-strand:aa_changeF>Y          NA         NA      NA       NA
## DSSPTurn:aa_changeF>Y              NA         NA      NA       NA
## DSSPα-helix:aa_changeG>A           NA         NA      NA       NA
## DSSPBend:aa_changeG>A              NA         NA      NA       NA
## DSSPβ-bridge:aa_changeG>A          NA         NA      NA       NA
## DSSP310 Helix:aa_changeG>A         NA         NA      NA       NA
## DSSPπ-helix:aa_changeG>A           NA         NA      NA       NA
## DSSPβ-strand:aa_changeG>A   5.313e+01  4.112e+05       0        1
## DSSPTurn:aa_changeG>A       5.313e+01  5.036e+05       0        1
## DSSPα-helix:aa_changeG>C           NA         NA      NA       NA
## DSSPBend:aa_changeG>C       5.313e+01  4.112e+05       0        1
## DSSPβ-bridge:aa_changeG>C          NA         NA      NA       NA
## DSSP310 Helix:aa_changeG>C         NA         NA      NA       NA
## DSSPπ-helix:aa_changeG>C    5.313e+01  4.112e+05       0        1
## DSSPβ-strand:aa_changeG>C   5.313e+01  2.908e+05       0        1
## DSSPTurn:aa_changeG>C       5.313e+01  2.518e+05       0        1
## DSSPα-helix:aa_changeG>D           NA         NA      NA       NA
## DSSPBend:aa_changeG>D       5.313e+01  3.807e+05       0        1
## DSSPβ-bridge:aa_changeG>D          NA         NA      NA       NA
## DSSP310 Helix:aa_changeG>D         NA         NA      NA       NA
## DSSPπ-helix:aa_changeG>D    5.313e+01  3.807e+05       0        1
## DSSPβ-strand:aa_changeG>D   5.313e+01  1.843e+05       0        1
## DSSPTurn:aa_changeG>D       5.313e+01  2.457e+05       0        1
## DSSPα-helix:aa_changeG>E           NA         NA      NA       NA
## DSSPBend:aa_changeG>E              NA         NA      NA       NA
## DSSPβ-bridge:aa_changeG>E          NA         NA      NA       NA
## DSSP310 Helix:aa_changeG>E         NA         NA      NA       NA
## DSSPπ-helix:aa_changeG>E           NA         NA      NA       NA
## DSSPβ-strand:aa_changeG>E   5.313e+01  4.112e+05       0        1
## DSSPTurn:aa_changeG>E       5.313e+01  4.362e+05       0        1
## DSSPα-helix:aa_changeG>R           NA         NA      NA       NA
## DSSPBend:aa_changeG>R       5.313e+01  3.807e+05       0        1
## DSSPβ-bridge:aa_changeG>R   5.313e+01  3.807e+05       0        1
## DSSP310 Helix:aa_changeG>R  5.313e+01  3.807e+05       0        1
## DSSPπ-helix:aa_changeG>R    5.313e+01  3.807e+05       0        1
## DSSPβ-strand:aa_changeG>R   5.313e+01  1.722e+05       0        1
## DSSPTurn:aa_changeG>R       5.313e+01  2.457e+05       0        1
## DSSPα-helix:aa_changeG>S           NA         NA      NA       NA
## DSSPBend:aa_changeG>S       5.313e+01  3.982e+05       0        1
## DSSPβ-bridge:aa_changeG>S          NA         NA      NA       NA
## DSSP310 Helix:aa_changeG>S  5.313e+01  3.982e+05       0        1
## DSSPπ-helix:aa_changeG>S    5.313e+01  3.982e+05       0        1
## DSSPβ-strand:aa_changeG>S   5.313e+01  2.232e+05       0        1
## DSSPTurn:aa_changeG>S       5.313e+01  2.299e+05       0        1
## DSSPα-helix:aa_changeG>V           NA         NA      NA       NA
## DSSPBend:aa_changeG>V       5.313e+01  3.777e+05       0        1
## DSSPβ-bridge:aa_changeG>V          NA         NA      NA       NA
## DSSP310 Helix:aa_changeG>V         NA         NA      NA       NA
## DSSPπ-helix:aa_changeG>V    5.313e+01  3.777e+05       0        1
## DSSPβ-strand:aa_changeG>V   5.313e+01  1.625e+05       0        1
## DSSPTurn:aa_changeG>V       5.313e+01  1.689e+05       0        1
## DSSPα-helix:aa_changeG>W           NA         NA      NA       NA
## DSSPBend:aa_changeG>W              NA         NA      NA       NA
## DSSPβ-bridge:aa_changeG>W          NA         NA      NA       NA
## DSSP310 Helix:aa_changeG>W         NA         NA      NA       NA
## DSSPπ-helix:aa_changeG>W           NA         NA      NA       NA
## DSSPβ-strand:aa_changeG>W   5.313e+01  5.036e+05       0        1
## DSSPTurn:aa_changeG>W       5.313e+01  5.036e+05       0        1
## DSSPα-helix:aa_changeH>D           NA         NA      NA       NA
## DSSPBend:aa_changeH>D              NA         NA      NA       NA
## DSSPβ-bridge:aa_changeH>D          NA         NA      NA       NA
## DSSP310 Helix:aa_changeH>D         NA         NA      NA       NA
## DSSPπ-helix:aa_changeH>D           NA         NA      NA       NA
## DSSPβ-strand:aa_changeH>D   5.313e+01  5.036e+05       0        1
## DSSPTurn:aa_changeH>D       5.313e+01  5.036e+05       0        1
## DSSPα-helix:aa_changeH>L    5.313e+01  4.362e+05       0        1
## DSSPBend:aa_changeH>L              NA         NA      NA       NA
## DSSPβ-bridge:aa_changeH>L   5.313e+01  4.362e+05       0        1
## DSSP310 Helix:aa_changeH>L         NA         NA      NA       NA
## DSSPπ-helix:aa_changeH>L           NA         NA      NA       NA
## DSSPβ-strand:aa_changeH>L          NA         NA      NA       NA
## DSSPTurn:aa_changeH>L       5.313e+01  3.561e+05       0        1
## DSSPα-helix:aa_changeH>N           NA         NA      NA       NA
## DSSPBend:aa_changeH>N              NA         NA      NA       NA
## DSSPβ-bridge:aa_changeH>N          NA         NA      NA       NA
## DSSP310 Helix:aa_changeH>N         NA         NA      NA       NA
## DSSPπ-helix:aa_changeH>N           NA         NA      NA       NA
## DSSPβ-strand:aa_changeH>N   5.313e+01  5.036e+05       0        1
## DSSPTurn:aa_changeH>N       5.313e+01  5.036e+05       0        1
## DSSPα-helix:aa_changeH>P    5.313e+01  4.362e+05       0        1
## DSSPBend:aa_changeH>P              NA         NA      NA       NA
## DSSPβ-bridge:aa_changeH>P          NA         NA      NA       NA
## DSSP310 Helix:aa_changeH>P         NA         NA      NA       NA
## DSSPπ-helix:aa_changeH>P           NA         NA      NA       NA
## DSSPβ-strand:aa_changeH>P   5.313e+01  3.561e+05       0        1
## DSSPTurn:aa_changeH>P       5.313e+01  3.561e+05       0        1
## DSSPα-helix:aa_changeH>Q           NA         NA      NA       NA
## DSSPBend:aa_changeH>Q              NA         NA      NA       NA
## DSSPβ-bridge:aa_changeH>Q          NA         NA      NA       NA
## DSSP310 Helix:aa_changeH>Q  5.313e+01  4.112e+05       0        1
## DSSPπ-helix:aa_changeH>Q           NA         NA      NA       NA
## DSSPβ-strand:aa_changeH>Q   5.313e+01  3.251e+05       0        1
## DSSPTurn:aa_changeH>Q       5.313e+01  3.251e+05       0        1
## DSSPα-helix:aa_changeH>R    5.313e+01  3.982e+05       0        1
## DSSPBend:aa_changeH>R              NA         NA      NA       NA
## DSSPβ-bridge:aa_changeH>R          NA         NA      NA       NA
## DSSP310 Helix:aa_changeH>R         NA         NA      NA       NA
## DSSPπ-helix:aa_changeH>R           NA         NA      NA       NA
## DSSPβ-strand:aa_changeH>R          NA         NA      NA       NA
## DSSPTurn:aa_changeH>R       5.313e+01  3.084e+05       0        1
## DSSPα-helix:aa_changeH>Y           NA         NA      NA       NA
## DSSPBend:aa_changeH>Y              NA         NA      NA       NA
## DSSPβ-bridge:aa_changeH>Y          NA         NA      NA       NA
## DSSP310 Helix:aa_changeH>Y         NA         NA      NA       NA
## DSSPπ-helix:aa_changeH>Y           NA         NA      NA       NA
## DSSPβ-strand:aa_changeH>Y          NA         NA      NA       NA
## DSSPTurn:aa_changeH>Y       5.313e+01  3.251e+05       0        1
## DSSPα-helix:aa_changeI>F    5.313e+01  5.036e+05       0        1
## DSSPBend:aa_changeI>F       5.313e+01  5.036e+05       0        1
## DSSPβ-bridge:aa_changeI>F          NA         NA      NA       NA
## DSSP310 Helix:aa_changeI>F         NA         NA      NA       NA
## DSSPπ-helix:aa_changeI>F           NA         NA      NA       NA
## DSSPβ-strand:aa_changeI>F   5.313e+01  3.982e+05       0        1
## DSSPTurn:aa_changeI>F              NA         NA      NA       NA
## DSSPα-helix:aa_changeI>M           NA         NA      NA       NA
## DSSPBend:aa_changeI>M              NA         NA      NA       NA
## DSSPβ-bridge:aa_changeI>M          NA         NA      NA       NA
## DSSP310 Helix:aa_changeI>M         NA         NA      NA       NA
## DSSPπ-helix:aa_changeI>M           NA         NA      NA       NA
## DSSPβ-strand:aa_changeI>M          NA         NA      NA       NA
## DSSPTurn:aa_changeI>M              NA         NA      NA       NA
## DSSPα-helix:aa_changeI>N           NA         NA      NA       NA
## DSSPBend:aa_changeI>N      -1.919e-07  3.847e+05       0        1
## DSSPβ-bridge:aa_changeI>N   5.002e-09  3.847e+05       0        1
## DSSP310 Helix:aa_changeI>N         NA         NA      NA       NA
## DSSPπ-helix:aa_changeI>N           NA         NA      NA       NA
## DSSPβ-strand:aa_changeI>N          NA         NA      NA       NA
## DSSPTurn:aa_changeI>N              NA         NA      NA       NA
## DSSPα-helix:aa_changeI>T           NA         NA      NA       NA
## DSSPBend:aa_changeI>T              NA         NA      NA       NA
## DSSPβ-bridge:aa_changeI>T          NA         NA      NA       NA
## DSSP310 Helix:aa_changeI>T         NA         NA      NA       NA
## DSSPπ-helix:aa_changeI>T           NA         NA      NA       NA
## DSSPβ-strand:aa_changeI>T          NA         NA      NA       NA
## DSSPTurn:aa_changeI>T              NA         NA      NA       NA
## DSSPα-helix:aa_changeI>V           NA         NA      NA       NA
## DSSPBend:aa_changeI>V       5.313e+01  4.362e+05       0        1
## DSSPβ-bridge:aa_changeI>V          NA         NA      NA       NA
## DSSP310 Helix:aa_changeI>V         NA         NA      NA       NA
## DSSPπ-helix:aa_changeI>V           NA         NA      NA       NA
## DSSPβ-strand:aa_changeI>V   5.313e+01  4.362e+05       0        1
## DSSPTurn:aa_changeI>V              NA         NA      NA       NA
## DSSPα-helix:aa_changeK>E    5.313e+01  5.036e+05       0        1
## DSSPBend:aa_changeK>E              NA         NA      NA       NA
## DSSPβ-bridge:aa_changeK>E          NA         NA      NA       NA
## DSSP310 Helix:aa_changeK>E         NA         NA      NA       NA
## DSSPπ-helix:aa_changeK>E           NA         NA      NA       NA
## DSSPβ-strand:aa_changeK>E   5.313e+01  4.112e+05       0        1
## DSSPTurn:aa_changeK>E              NA         NA      NA       NA
## DSSPα-helix:aa_changeK>N   -4.078e-08  4.362e+05       0        1
## DSSPBend:aa_changeK>N              NA         NA      NA       NA
## DSSPβ-bridge:aa_changeK>N          NA         NA      NA       NA
## DSSP310 Helix:aa_changeK>N         NA         NA      NA       NA
## DSSPπ-helix:aa_changeK>N           NA         NA      NA       NA
## DSSPβ-strand:aa_changeK>N          NA         NA      NA       NA
## DSSPTurn:aa_changeK>N              NA         NA      NA       NA
## DSSPα-helix:aa_changeK>Q   -1.946e-07  5.036e+05       0        1
## DSSPBend:aa_changeK>Q              NA         NA      NA       NA
## DSSPβ-bridge:aa_changeK>Q          NA         NA      NA       NA
## DSSP310 Helix:aa_changeK>Q         NA         NA      NA       NA
## DSSPπ-helix:aa_changeK>Q           NA         NA      NA       NA
## DSSPβ-strand:aa_changeK>Q          NA         NA      NA       NA
## DSSPTurn:aa_changeK>Q              NA         NA      NA       NA
## DSSPα-helix:aa_changeL>F           NA         NA      NA       NA
## DSSPBend:aa_changeL>F              NA         NA      NA       NA
## DSSPβ-bridge:aa_changeL>F          NA         NA      NA       NA
## DSSP310 Helix:aa_changeL>F         NA         NA      NA       NA
## DSSPπ-helix:aa_changeL>F           NA         NA      NA       NA
## DSSPβ-strand:aa_changeL>F   5.313e+01  5.036e+05       0        1
## DSSPTurn:aa_changeL>F              NA         NA      NA       NA
## DSSPα-helix:aa_changeL>H           NA         NA      NA       NA
## DSSPBend:aa_changeL>H              NA         NA      NA       NA
## DSSPβ-bridge:aa_changeL>H          NA         NA      NA       NA
## DSSP310 Helix:aa_changeL>H         NA         NA      NA       NA
## DSSPπ-helix:aa_changeL>H           NA         NA      NA       NA
## DSSPβ-strand:aa_changeL>H   5.313e+01  5.036e+05       0        1
## DSSPTurn:aa_changeL>H              NA         NA      NA       NA
## DSSPα-helix:aa_changeL>I    5.313e+01  5.036e+05       0        1
## DSSPBend:aa_changeL>I              NA         NA      NA       NA
## DSSPβ-bridge:aa_changeL>I          NA         NA      NA       NA
## DSSP310 Helix:aa_changeL>I         NA         NA      NA       NA
## DSSPπ-helix:aa_changeL>I           NA         NA      NA       NA
## DSSPβ-strand:aa_changeL>I   5.313e+01  5.036e+05       0        1
## DSSPTurn:aa_changeL>I              NA         NA      NA       NA
## DSSPα-helix:aa_changeL>M    5.313e+01  4.112e+05       0        1
## DSSPBend:aa_changeL>M              NA         NA      NA       NA
## DSSPβ-bridge:aa_changeL>M          NA         NA      NA       NA
## DSSP310 Helix:aa_changeL>M         NA         NA      NA       NA
## DSSPπ-helix:aa_changeL>M           NA         NA      NA       NA
## DSSPβ-strand:aa_changeL>M   5.313e+01  3.251e+05       0        1
## DSSPTurn:aa_changeL>M              NA         NA      NA       NA
## DSSPα-helix:aa_changeL>P    5.313e+01  3.251e+05       0        1
## DSSPBend:aa_changeL>P       5.313e+01  4.112e+05       0        1
## DSSPβ-bridge:aa_changeL>P          NA         NA      NA       NA
## DSSP310 Helix:aa_changeL>P         NA         NA      NA       NA
## DSSPπ-helix:aa_changeL>P           NA         NA      NA       NA
## DSSPβ-strand:aa_changeL>P   5.313e+01  2.344e+05       0        1
## DSSPTurn:aa_changeL>P       5.313e+01  4.112e+05       0        1
## DSSPα-helix:aa_changeL>Q    5.313e+01  4.362e+05       0        1
## DSSPBend:aa_changeL>Q              NA         NA      NA       NA
## DSSPβ-bridge:aa_changeL>Q          NA         NA      NA       NA
## DSSP310 Helix:aa_changeL>Q         NA         NA      NA       NA
## DSSPπ-helix:aa_changeL>Q           NA         NA      NA       NA
## DSSPβ-strand:aa_changeL>Q   5.313e+01  3.251e+05       0        1
## DSSPTurn:aa_changeL>Q              NA         NA      NA       NA
## DSSPα-helix:aa_changeL>R    5.313e+01  4.362e+05       0        1
## DSSPBend:aa_changeL>R       5.313e+01  4.362e+05       0        1
## DSSPβ-bridge:aa_changeL>R          NA         NA      NA       NA
## DSSP310 Helix:aa_changeL>R  5.313e+01  4.362e+05       0        1
## DSSPπ-helix:aa_changeL>R           NA         NA      NA       NA
## DSSPβ-strand:aa_changeL>R   5.313e+01  2.980e+05       0        1
## DSSPTurn:aa_changeL>R              NA         NA      NA       NA
## DSSPα-helix:aa_changeL>V           NA         NA      NA       NA
## DSSPBend:aa_changeL>V              NA         NA      NA       NA
## DSSPβ-bridge:aa_changeL>V          NA         NA      NA       NA
## DSSP310 Helix:aa_changeL>V         NA         NA      NA       NA
## DSSPπ-helix:aa_changeL>V           NA         NA      NA       NA
## DSSPβ-strand:aa_changeL>V   5.313e+01  4.362e+05       0        1
## DSSPTurn:aa_changeL>V              NA         NA      NA       NA
## DSSPα-helix:aa_changeM>I    5.313e+01  3.561e+05       0        1
## DSSPBend:aa_changeM>I              NA         NA      NA       NA
## DSSPβ-bridge:aa_changeM>I          NA         NA      NA       NA
## DSSP310 Helix:aa_changeM>I         NA         NA      NA       NA
## DSSPπ-helix:aa_changeM>I           NA         NA      NA       NA
## DSSPβ-strand:aa_changeM>I          NA         NA      NA       NA
## DSSPTurn:aa_changeM>I       5.313e+01  4.362e+05       0        1
## DSSPα-helix:aa_changeM>K    5.313e+01  5.036e+05       0        1
## DSSPBend:aa_changeM>K       5.313e+01  5.036e+05       0        1
## DSSPβ-bridge:aa_changeM>K   5.313e+01  5.036e+05       0        1
## DSSP310 Helix:aa_changeM>K         NA         NA      NA       NA
## DSSPπ-helix:aa_changeM>K           NA         NA      NA       NA
## DSSPβ-strand:aa_changeM>K          NA         NA      NA       NA
## DSSPTurn:aa_changeM>K              NA         NA      NA       NA
## DSSPα-helix:aa_changeM>L           NA         NA      NA       NA
## DSSPBend:aa_changeM>L       5.313e+01  4.362e+05       0        1
## DSSPβ-bridge:aa_changeM>L          NA         NA      NA       NA
## DSSP310 Helix:aa_changeM>L         NA         NA      NA       NA
## DSSPπ-helix:aa_changeM>L           NA         NA      NA       NA
## DSSPβ-strand:aa_changeM>L          NA         NA      NA       NA
## DSSPTurn:aa_changeM>L              NA         NA      NA       NA
## DSSPα-helix:aa_changeM>R    5.313e+01  3.561e+05       0        1
## DSSPBend:aa_changeM>R              NA         NA      NA       NA
## DSSPβ-bridge:aa_changeM>R   5.313e+01  4.362e+05       0        1
## DSSP310 Helix:aa_changeM>R         NA         NA      NA       NA
## DSSPπ-helix:aa_changeM>R           NA         NA      NA       NA
## DSSPβ-strand:aa_changeM>R          NA         NA      NA       NA
## DSSPTurn:aa_changeM>R              NA         NA      NA       NA
## DSSPα-helix:aa_changeM>T           NA         NA      NA       NA
## DSSPBend:aa_changeM>T              NA         NA      NA       NA
## DSSPβ-bridge:aa_changeM>T          NA         NA      NA       NA
## DSSP310 Helix:aa_changeM>T         NA         NA      NA       NA
## DSSPπ-helix:aa_changeM>T           NA         NA      NA       NA
## DSSPβ-strand:aa_changeM>T          NA         NA      NA       NA
## DSSPTurn:aa_changeM>T              NA         NA      NA       NA
## DSSPα-helix:aa_changeM>V           NA         NA      NA       NA
## DSSPBend:aa_changeM>V              NA         NA      NA       NA
## DSSPβ-bridge:aa_changeM>V   4.616e-06  5.036e+05       0        1
## DSSP310 Helix:aa_changeM>V         NA         NA      NA       NA
## DSSPπ-helix:aa_changeM>V           NA         NA      NA       NA
## DSSPβ-strand:aa_changeM>V          NA         NA      NA       NA
## DSSPTurn:aa_changeM>V              NA         NA      NA       NA
## DSSPα-helix:aa_changeN>D    5.391e-09  5.036e+05       0        1
## DSSPBend:aa_changeN>D       1.929e-07  5.036e+05       0        1
## DSSPβ-bridge:aa_changeN>D          NA         NA      NA       NA
## DSSP310 Helix:aa_changeN>D         NA         NA      NA       NA
## DSSPπ-helix:aa_changeN>D           NA         NA      NA       NA
## DSSPβ-strand:aa_changeN>D  -1.518e-09  5.036e+05       0        1
## DSSPTurn:aa_changeN>D              NA         NA      NA       NA
## DSSPα-helix:aa_changeN>H           NA         NA      NA       NA
## DSSPBend:aa_changeN>H              NA         NA      NA       NA
## DSSPβ-bridge:aa_changeN>H          NA         NA      NA       NA
## DSSP310 Helix:aa_changeN>H         NA         NA      NA       NA
## DSSPπ-helix:aa_changeN>H           NA         NA      NA       NA
## DSSPβ-strand:aa_changeN>H   5.313e+01  5.036e+05       0        1
## DSSPTurn:aa_changeN>H              NA         NA      NA       NA
## DSSPα-helix:aa_changeN>I    5.313e+01  4.112e+05       0        1
## DSSPBend:aa_changeN>I       5.313e+01  2.908e+05       0        1
## DSSPβ-bridge:aa_changeN>I          NA         NA      NA       NA
## DSSP310 Helix:aa_changeN>I         NA         NA      NA       NA
## DSSPπ-helix:aa_changeN>I           NA         NA      NA       NA
## DSSPβ-strand:aa_changeN>I   5.313e+01  3.251e+05       0        1
## DSSPTurn:aa_changeN>I       5.313e+01  4.112e+05       0        1
## DSSPα-helix:aa_changeN>K    5.313e+01  3.982e+05       0        1
## DSSPBend:aa_changeN>K       5.313e+01  2.389e+05       0        1
## DSSPβ-bridge:aa_changeN>K          NA         NA      NA       NA
## DSSP310 Helix:aa_changeN>K         NA         NA      NA       NA
## DSSPπ-helix:aa_changeN>K           NA         NA      NA       NA
## DSSPβ-strand:aa_changeN>K   5.313e+01  3.084e+05       0        1
## DSSPTurn:aa_changeN>K       5.313e+01  3.084e+05       0        1
## DSSPα-helix:aa_changeN>S    1.811e-07  4.362e+05       0        1
## DSSPBend:aa_changeN>S              NA         NA      NA       NA
## DSSPβ-bridge:aa_changeN>S          NA         NA      NA       NA
## DSSP310 Helix:aa_changeN>S         NA         NA      NA       NA
## DSSPπ-helix:aa_changeN>S           NA         NA      NA       NA
## DSSPβ-strand:aa_changeN>S          NA         NA      NA       NA
## DSSPTurn:aa_changeN>S              NA         NA      NA       NA
## DSSPα-helix:aa_changeN>T           NA         NA      NA       NA
## DSSPBend:aa_changeN>T       5.313e+01  4.362e+05       0        1
## DSSPβ-bridge:aa_changeN>T          NA         NA      NA       NA
## DSSP310 Helix:aa_changeN>T         NA         NA      NA       NA
## DSSPπ-helix:aa_changeN>T           NA         NA      NA       NA
## DSSPβ-strand:aa_changeN>T          NA         NA      NA       NA
## DSSPTurn:aa_changeN>T              NA         NA      NA       NA
## DSSPα-helix:aa_changeN>Y   -1.799e-07  5.036e+05       0        1
## DSSPBend:aa_changeN>Y      -1.895e-07  3.982e+05       0        1
## DSSPβ-bridge:aa_changeN>Y          NA         NA      NA       NA
## DSSP310 Helix:aa_changeN>Y         NA         NA      NA       NA
## DSSPπ-helix:aa_changeN>Y           NA         NA      NA       NA
## DSSPβ-strand:aa_changeN>Y          NA         NA      NA       NA
## DSSPTurn:aa_changeN>Y              NA         NA      NA       NA
## DSSPα-helix:aa_changeP>A           NA         NA      NA       NA
## DSSPBend:aa_changeP>A       5.313e+01  4.362e+05       0        1
## DSSPβ-bridge:aa_changeP>A          NA         NA      NA       NA
## DSSP310 Helix:aa_changeP>A  5.313e+01  4.362e+05       0        1
## DSSPπ-helix:aa_changeP>A           NA         NA      NA       NA
## DSSPβ-strand:aa_changeP>A          NA         NA      NA       NA
## DSSPTurn:aa_changeP>A              NA         NA      NA       NA
## DSSPα-helix:aa_changeP>H           NA         NA      NA       NA
## DSSPBend:aa_changeP>H              NA         NA      NA       NA
## DSSPβ-bridge:aa_changeP>H          NA         NA      NA       NA
## DSSP310 Helix:aa_changeP>H         NA         NA      NA       NA
## DSSPπ-helix:aa_changeP>H           NA         NA      NA       NA
## DSSPβ-strand:aa_changeP>H          NA         NA      NA       NA
## DSSPTurn:aa_changeP>H       5.313e+01  4.362e+05       0        1
## DSSPα-helix:aa_changeP>L           NA         NA      NA       NA
## DSSPBend:aa_changeP>L       5.313e+01  2.299e+05       0        1
## DSSPβ-bridge:aa_changeP>L          NA         NA      NA       NA
## DSSP310 Helix:aa_changeP>L         NA         NA      NA       NA
## DSSPπ-helix:aa_changeP>L           NA         NA      NA       NA
## DSSPβ-strand:aa_changeP>L   5.313e+01  3.847e+05       0        1
## DSSPTurn:aa_changeP>L       5.313e+01  2.908e+05       0        1
## DSSPα-helix:aa_changeP>Q           NA         NA      NA       NA
## DSSPBend:aa_changeP>Q       5.313e+01  3.735e+05       0        1
## DSSPβ-bridge:aa_changeP>Q          NA         NA      NA       NA
## DSSP310 Helix:aa_changeP>Q         NA         NA      NA       NA
## DSSPπ-helix:aa_changeP>Q           NA         NA      NA       NA
## DSSPβ-strand:aa_changeP>Q   5.313e+01  3.735e+05       0        1
## DSSPTurn:aa_changeP>Q       5.313e+01  3.735e+05       0        1
## DSSPα-helix:aa_changeP>R           NA         NA      NA       NA
## DSSPBend:aa_changeP>R       5.313e+01  3.901e+05       0        1
## DSSPβ-bridge:aa_changeP>R          NA         NA      NA       NA
## DSSP310 Helix:aa_changeP>R         NA         NA      NA       NA
## DSSPπ-helix:aa_changeP>R           NA         NA      NA       NA
## DSSPβ-strand:aa_changeP>R          NA         NA      NA       NA
## DSSPTurn:aa_changeP>R       5.313e+01  3.901e+05       0        1
## DSSPα-helix:aa_changeP>S    5.313e+01  4.112e+05       0        1
## DSSPBend:aa_changeP>S       5.313e+01  4.112e+05       0        1
## DSSPβ-bridge:aa_changeP>S          NA         NA      NA       NA
## DSSP310 Helix:aa_changeP>S         NA         NA      NA       NA
## DSSPπ-helix:aa_changeP>S           NA         NA      NA       NA
## DSSPβ-strand:aa_changeP>S          NA         NA      NA       NA
## DSSPTurn:aa_changeP>S       5.313e+01  3.251e+05       0        1
## DSSPα-helix:aa_changeP>T           NA         NA      NA       NA
## DSSPBend:aa_changeP>T       5.313e+01  2.601e+05       0        1
## DSSPβ-bridge:aa_changeP>T          NA         NA      NA       NA
## DSSP310 Helix:aa_changeP>T         NA         NA      NA       NA
## DSSPπ-helix:aa_changeP>T           NA         NA      NA       NA
## DSSPβ-strand:aa_changeP>T   5.313e+01  3.901e+05       0        1
## DSSPTurn:aa_changeP>T       5.313e+01  2.980e+05       0        1
## DSSPα-helix:aa_changeQ>E   -4.443e-06  4.362e+05       0        1
## DSSPBend:aa_changeQ>E              NA         NA      NA       NA
## DSSPβ-bridge:aa_changeQ>E          NA         NA      NA       NA
## DSSP310 Helix:aa_changeQ>E         NA         NA      NA       NA
## DSSPπ-helix:aa_changeQ>E           NA         NA      NA       NA
## DSSPβ-strand:aa_changeQ>E  -3.677e-08  4.362e+05       0        1
## DSSPTurn:aa_changeQ>E              NA         NA      NA       NA
## DSSPα-helix:aa_changeQ>H    5.313e+01  3.251e+05       0        1
## DSSPBend:aa_changeQ>H              NA         NA      NA       NA
## DSSPβ-bridge:aa_changeQ>H          NA         NA      NA       NA
## DSSP310 Helix:aa_changeQ>H         NA         NA      NA       NA
## DSSPπ-helix:aa_changeQ>H           NA         NA      NA       NA
## DSSPβ-strand:aa_changeQ>H   5.313e+01  2.908e+05       0        1
## DSSPTurn:aa_changeQ>H       5.313e+01  4.362e+05       0        1
## DSSPα-helix:aa_changeQ>K    5.313e+01  2.980e+05       0        1
## DSSPBend:aa_changeQ>K              NA         NA      NA       NA
## DSSPβ-bridge:aa_changeQ>K          NA         NA      NA       NA
## DSSP310 Helix:aa_changeQ>K         NA         NA      NA       NA
## DSSPπ-helix:aa_changeQ>K           NA         NA      NA       NA
## DSSPβ-strand:aa_changeQ>K   5.313e+01  2.980e+05       0        1
## DSSPTurn:aa_changeQ>K       5.313e+01  3.901e+05       0        1
## DSSPα-helix:aa_changeQ>L           NA         NA      NA       NA
## DSSPBend:aa_changeQ>L              NA         NA      NA       NA
## DSSPβ-bridge:aa_changeQ>L          NA         NA      NA       NA
## DSSP310 Helix:aa_changeQ>L         NA         NA      NA       NA
## DSSPπ-helix:aa_changeQ>L           NA         NA      NA       NA
## DSSPβ-strand:aa_changeQ>L          NA         NA      NA       NA
## DSSPTurn:aa_changeQ>L              NA         NA      NA       NA
## DSSPα-helix:aa_changeQ>P           NA         NA      NA       NA
## DSSPBend:aa_changeQ>P              NA         NA      NA       NA
## DSSPβ-bridge:aa_changeQ>P          NA         NA      NA       NA
## DSSP310 Helix:aa_changeQ>P         NA         NA      NA       NA
## DSSPπ-helix:aa_changeQ>P           NA         NA      NA       NA
## DSSPβ-strand:aa_changeQ>P   5.313e+01  4.362e+05       0        1
## DSSPTurn:aa_changeQ>P       5.313e+01  4.362e+05       0        1
## DSSPα-helix:aa_changeQ>R    5.313e+01  5.036e+05       0        1
## DSSPBend:aa_changeQ>R              NA         NA      NA       NA
## DSSPβ-bridge:aa_changeQ>R          NA         NA      NA       NA
## DSSP310 Helix:aa_changeQ>R  5.313e+01  5.036e+05       0        1
## DSSPπ-helix:aa_changeQ>R           NA         NA      NA       NA
## DSSPβ-strand:aa_changeQ>R   5.313e+01  5.036e+05       0        1
## DSSPTurn:aa_changeQ>R       5.313e+01  5.036e+05       0        1
## DSSPα-helix:aa_changeR>C    5.313e+01  2.908e+05       0        1
## DSSPBend:aa_changeR>C       5.313e+01  3.251e+05       0        1
## DSSPβ-bridge:aa_changeR>C   5.313e+01  4.112e+05       0        1
## DSSP310 Helix:aa_changeR>C         NA         NA      NA       NA
## DSSPπ-helix:aa_changeR>C           NA         NA      NA       NA
## DSSPβ-strand:aa_changeR>C   5.313e+01  2.601e+05       0        1
## DSSPTurn:aa_changeR>C       5.313e+01  4.112e+05       0        1
## DSSPα-helix:aa_changeR>G           NA         NA      NA       NA
## DSSPBend:aa_changeR>G              NA         NA      NA       NA
## DSSPβ-bridge:aa_changeR>G   5.313e+01  4.362e+05       0        1
## DSSP310 Helix:aa_changeR>G         NA         NA      NA       NA
## DSSPπ-helix:aa_changeR>G           NA         NA      NA       NA
## DSSPβ-strand:aa_changeR>G   5.313e+01  2.980e+05       0        1
## DSSPTurn:aa_changeR>G              NA         NA      NA       NA
## DSSPα-helix:aa_changeR>H    5.313e+01  3.251e+05       0        1
## DSSPBend:aa_changeR>H       5.313e+01  3.561e+05       0        1
## DSSPβ-bridge:aa_changeR>H   5.313e+01  4.362e+05       0        1
## DSSP310 Helix:aa_changeR>H         NA         NA      NA       NA
## DSSPπ-helix:aa_changeR>H           NA         NA      NA       NA
## DSSPβ-strand:aa_changeR>H   5.313e+01  3.561e+05       0        1
## DSSPTurn:aa_changeR>H       5.313e+01  3.561e+05       0        1
## DSSPα-helix:aa_changeR>L    5.313e+01  3.084e+05       0        1
## DSSPBend:aa_changeR>L       5.313e+01  3.084e+05       0        1
## DSSPβ-bridge:aa_changeR>L   5.313e+01  3.982e+05       0        1
## DSSP310 Helix:aa_changeR>L  5.313e+01  3.982e+05       0        1
## DSSPπ-helix:aa_changeR>L           NA         NA      NA       NA
## DSSPβ-strand:aa_changeR>L   5.313e+01  2.232e+05       0        1
## DSSPTurn:aa_changeR>L              NA         NA      NA       NA
## DSSPα-helix:aa_changeR>P    5.313e+01  3.251e+05       0        1
## DSSPBend:aa_changeR>P       5.313e+01  2.908e+05       0        1
## DSSPβ-bridge:aa_changeR>P   5.313e+01  4.112e+05       0        1
## DSSP310 Helix:aa_changeR>P  5.313e+01  4.112e+05       0        1
## DSSPπ-helix:aa_changeR>P    5.313e+01  4.112e+05       0        1
## DSSPβ-strand:aa_changeR>P   5.313e+01  2.518e+05       0        1
## DSSPTurn:aa_changeR>P       5.313e+01  4.112e+05       0        1
## DSSPα-helix:aa_changeR>Q           NA         NA      NA       NA
## DSSPBend:aa_changeR>Q              NA         NA      NA       NA
## DSSPβ-bridge:aa_changeR>Q          NA         NA      NA       NA
## DSSP310 Helix:aa_changeR>Q         NA         NA      NA       NA
## DSSPπ-helix:aa_changeR>Q           NA         NA      NA       NA
## DSSPβ-strand:aa_changeR>Q          NA         NA      NA       NA
## DSSPTurn:aa_changeR>Q              NA         NA      NA       NA
## DSSPα-helix:aa_changeR>S    5.313e+01  3.901e+05       0        1
## DSSPBend:aa_changeR>S       5.313e+01  3.901e+05       0        1
## DSSPβ-bridge:aa_changeR>S   5.313e+01  2.980e+05       0        1
## DSSP310 Helix:aa_changeR>S         NA         NA      NA       NA
## DSSPπ-helix:aa_changeR>S    5.313e+01  3.901e+05       0        1
## DSSPβ-strand:aa_changeR>S   5.313e+01  2.980e+05       0        1
## DSSPTurn:aa_changeR>S              NA         NA      NA       NA
## DSSPα-helix:aa_changeR>W           NA         NA      NA       NA
## DSSPBend:aa_changeR>W              NA         NA      NA       NA
## DSSPβ-bridge:aa_changeR>W          NA         NA      NA       NA
## DSSP310 Helix:aa_changeR>W         NA         NA      NA       NA
## DSSPπ-helix:aa_changeR>W           NA         NA      NA       NA
## DSSPβ-strand:aa_changeR>W          NA         NA      NA       NA
## DSSPTurn:aa_changeR>W              NA         NA      NA       NA
## DSSPα-helix:aa_changeS>C           NA         NA      NA       NA
## DSSPBend:aa_changeS>C       5.313e+01  3.561e+05       0        1
## DSSPβ-bridge:aa_changeS>C          NA         NA      NA       NA
## DSSP310 Helix:aa_changeS>C         NA         NA      NA       NA
## DSSPπ-helix:aa_changeS>C           NA         NA      NA       NA
## DSSPβ-strand:aa_changeS>C   5.313e+01  4.362e+05       0        1
## DSSPTurn:aa_changeS>C              NA         NA      NA       NA
## DSSPα-helix:aa_changeS>F           NA         NA      NA       NA
## DSSPBend:aa_changeS>F              NA         NA      NA       NA
## DSSPβ-bridge:aa_changeS>F          NA         NA      NA       NA
## DSSP310 Helix:aa_changeS>F         NA         NA      NA       NA
## DSSPπ-helix:aa_changeS>F           NA         NA      NA       NA
## DSSPβ-strand:aa_changeS>F          NA         NA      NA       NA
## DSSPTurn:aa_changeS>F              NA         NA      NA       NA
## DSSPα-helix:aa_changeS>G           NA         NA      NA       NA
## DSSPBend:aa_changeS>G              NA         NA      NA       NA
## DSSPβ-bridge:aa_changeS>G          NA         NA      NA       NA
## DSSP310 Helix:aa_changeS>G         NA         NA      NA       NA
## DSSPπ-helix:aa_changeS>G           NA         NA      NA       NA
## DSSPβ-strand:aa_changeS>G          NA         NA      NA       NA
## DSSPTurn:aa_changeS>G       5.313e+01  4.362e+05       0        1
## DSSPα-helix:aa_changeS>I           NA         NA      NA       NA
## DSSPBend:aa_changeS>I              NA         NA      NA       NA
## DSSPβ-bridge:aa_changeS>I          NA         NA      NA       NA
## DSSP310 Helix:aa_changeS>I         NA         NA      NA       NA
## DSSPπ-helix:aa_changeS>I           NA         NA      NA       NA
## DSSPβ-strand:aa_changeS>I          NA         NA      NA       NA
## DSSPTurn:aa_changeS>I       5.313e+01  4.362e+05       0        1
## DSSPα-helix:aa_changeS>L           NA         NA      NA       NA
## DSSPBend:aa_changeS>L       5.313e+01  5.036e+05       0        1
## DSSPβ-bridge:aa_changeS>L          NA         NA      NA       NA
## DSSP310 Helix:aa_changeS>L         NA         NA      NA       NA
## DSSPπ-helix:aa_changeS>L           NA         NA      NA       NA
## DSSPβ-strand:aa_changeS>L   5.313e+01  4.112e+05       0        1
## DSSPTurn:aa_changeS>L       5.313e+01  5.036e+05       0        1
## DSSPα-helix:aa_changeS>N           NA         NA      NA       NA
## DSSPBend:aa_changeS>N              NA         NA      NA       NA
## DSSPβ-bridge:aa_changeS>N          NA         NA      NA       NA
## DSSP310 Helix:aa_changeS>N         NA         NA      NA       NA
## DSSPπ-helix:aa_changeS>N           NA         NA      NA       NA
## DSSPβ-strand:aa_changeS>N   5.313e+01  4.362e+05       0        1
## DSSPTurn:aa_changeS>N       5.313e+01  4.362e+05       0        1
## DSSPα-helix:aa_changeS>P           NA         NA      NA       NA
## DSSPBend:aa_changeS>P              NA         NA      NA       NA
## DSSPβ-bridge:aa_changeS>P   5.313e+01  3.982e+05       0        1
## DSSP310 Helix:aa_changeS>P         NA         NA      NA       NA
## DSSPπ-helix:aa_changeS>P           NA         NA      NA       NA
## DSSPβ-strand:aa_changeS>P   5.313e+01  2.720e+05       0        1
## DSSPTurn:aa_changeS>P              NA         NA      NA       NA
## DSSPα-helix:aa_changeS>R           NA         NA      NA       NA
## DSSPBend:aa_changeS>R              NA         NA      NA       NA
## DSSPβ-bridge:aa_changeS>R          NA         NA      NA       NA
## DSSP310 Helix:aa_changeS>R         NA         NA      NA       NA
## DSSPπ-helix:aa_changeS>R           NA         NA      NA       NA
## DSSPβ-strand:aa_changeS>R   5.313e+01  2.518e+05       0        1
## DSSPTurn:aa_changeS>R       5.313e+01  2.908e+05       0        1
## DSSPα-helix:aa_changeS>T           NA         NA      NA       NA
## DSSPBend:aa_changeS>T              NA         NA      NA       NA
## DSSPβ-bridge:aa_changeS>T          NA         NA      NA       NA
## DSSP310 Helix:aa_changeS>T         NA         NA      NA       NA
## DSSPπ-helix:aa_changeS>T           NA         NA      NA       NA
## DSSPβ-strand:aa_changeS>T          NA         NA      NA       NA
## DSSPTurn:aa_changeS>T              NA         NA      NA       NA
## DSSPα-helix:aa_changeS>W           NA         NA      NA       NA
## DSSPBend:aa_changeS>W      -4.617e-06  5.036e+05       0        1
## DSSPβ-bridge:aa_changeS>W          NA         NA      NA       NA
## DSSP310 Helix:aa_changeS>W         NA         NA      NA       NA
## DSSPπ-helix:aa_changeS>W           NA         NA      NA       NA
## DSSPβ-strand:aa_changeS>W   2.336e-08  5.036e+05       0        1
## DSSPTurn:aa_changeS>W              NA         NA      NA       NA
## DSSPα-helix:aa_changeS>Y           NA         NA      NA       NA
## DSSPBend:aa_changeS>Y              NA         NA      NA       NA
## DSSPβ-bridge:aa_changeS>Y          NA         NA      NA       NA
## DSSP310 Helix:aa_changeS>Y         NA         NA      NA       NA
## DSSPπ-helix:aa_changeS>Y           NA         NA      NA       NA
## DSSPβ-strand:aa_changeS>Y          NA         NA      NA       NA
## DSSPTurn:aa_changeS>Y              NA         NA      NA       NA
## DSSPα-helix:aa_changeT>A    5.313e+01  5.036e+05       0        1
## DSSPBend:aa_changeT>A              NA         NA      NA       NA
## DSSPβ-bridge:aa_changeT>A          NA         NA      NA       NA
## DSSP310 Helix:aa_changeT>A         NA         NA      NA       NA
## DSSPπ-helix:aa_changeT>A           NA         NA      NA       NA
## DSSPβ-strand:aa_changeT>A   5.313e+01  4.362e+05       0        1
## DSSPTurn:aa_changeT>A              NA         NA      NA       NA
## DSSPα-helix:aa_changeT>I           NA         NA      NA       NA
## DSSPBend:aa_changeT>I       4.447e-06  5.036e+05       0        1
## DSSPβ-bridge:aa_changeT>I          NA         NA      NA       NA
## DSSP310 Helix:aa_changeT>I         NA         NA      NA       NA
## DSSPπ-helix:aa_changeT>I           NA         NA      NA       NA
## DSSPβ-strand:aa_changeT>I          NA         NA      NA       NA
## DSSPTurn:aa_changeT>I              NA         NA      NA       NA
## DSSPα-helix:aa_changeT>K    2.294e-07  4.112e+05       0        1
## DSSPBend:aa_changeT>K              NA         NA      NA       NA
## DSSPβ-bridge:aa_changeT>K          NA         NA      NA       NA
## DSSP310 Helix:aa_changeT>K         NA         NA      NA       NA
## DSSPπ-helix:aa_changeT>K           NA         NA      NA       NA
## DSSPβ-strand:aa_changeT>K          NA         NA      NA       NA
## DSSPTurn:aa_changeT>K              NA         NA      NA       NA
## DSSPα-helix:aa_changeT>M   -4.420e-06  5.036e+05       0        1
## DSSPBend:aa_changeT>M      -4.856e-10  5.036e+05       0        1
## DSSPβ-bridge:aa_changeT>M          NA         NA      NA       NA
## DSSP310 Helix:aa_changeT>M         NA         NA      NA       NA
## DSSPπ-helix:aa_changeT>M           NA         NA      NA       NA
## DSSPβ-strand:aa_changeT>M          NA         NA      NA       NA
## DSSPTurn:aa_changeT>M              NA         NA      NA       NA
## DSSPα-helix:aa_changeT>N           NA         NA      NA       NA
## DSSPBend:aa_changeT>N              NA         NA      NA       NA
## DSSPβ-bridge:aa_changeT>N          NA         NA      NA       NA
## DSSP310 Helix:aa_changeT>N         NA         NA      NA       NA
## DSSPπ-helix:aa_changeT>N           NA         NA      NA       NA
## DSSPβ-strand:aa_changeT>N          NA         NA      NA       NA
## DSSPTurn:aa_changeT>N              NA         NA      NA       NA
## DSSPα-helix:aa_changeT>P           NA         NA      NA       NA
## DSSPBend:aa_changeT>P              NA         NA      NA       NA
## DSSPβ-bridge:aa_changeT>P          NA         NA      NA       NA
## DSSP310 Helix:aa_changeT>P         NA         NA      NA       NA
## DSSPπ-helix:aa_changeT>P           NA         NA      NA       NA
## DSSPβ-strand:aa_changeT>P   5.313e+01  2.908e+05       0        1
## DSSPTurn:aa_changeT>P       5.313e+01  4.112e+05       0        1
## DSSPα-helix:aa_changeT>R           NA         NA      NA       NA
## DSSPBend:aa_changeT>R              NA         NA      NA       NA
## DSSPβ-bridge:aa_changeT>R          NA         NA      NA       NA
## DSSP310 Helix:aa_changeT>R         NA         NA      NA       NA
## DSSPπ-helix:aa_changeT>R           NA         NA      NA       NA
## DSSPβ-strand:aa_changeT>R          NA         NA      NA       NA
## DSSPTurn:aa_changeT>R              NA         NA      NA       NA
## DSSPα-helix:aa_changeT>S    5.313e+01  5.036e+05       0        1
## DSSPBend:aa_changeT>S              NA         NA      NA       NA
## DSSPβ-bridge:aa_changeT>S          NA         NA      NA       NA
## DSSP310 Helix:aa_changeT>S         NA         NA      NA       NA
## DSSPπ-helix:aa_changeT>S           NA         NA      NA       NA
## DSSPβ-strand:aa_changeT>S   5.313e+01  4.362e+05       0        1
## DSSPTurn:aa_changeT>S              NA         NA      NA       NA
## DSSPα-helix:aa_changeV>A           NA         NA      NA       NA
## DSSPBend:aa_changeV>A              NA         NA      NA       NA
## DSSPβ-bridge:aa_changeV>A          NA         NA      NA       NA
## DSSP310 Helix:aa_changeV>A         NA         NA      NA       NA
## DSSPπ-helix:aa_changeV>A           NA         NA      NA       NA
## DSSPβ-strand:aa_changeV>A          NA         NA      NA       NA
## DSSPTurn:aa_changeV>A              NA         NA      NA       NA
## DSSPα-helix:aa_changeV>D           NA         NA      NA       NA
## DSSPBend:aa_changeV>D              NA         NA      NA       NA
## DSSPβ-bridge:aa_changeV>D          NA         NA      NA       NA
## DSSP310 Helix:aa_changeV>D         NA         NA      NA       NA
## DSSPπ-helix:aa_changeV>D           NA         NA      NA       NA
## DSSPβ-strand:aa_changeV>D          NA         NA      NA       NA
## DSSPTurn:aa_changeV>D              NA         NA      NA       NA
## DSSPα-helix:aa_changeV>E           NA         NA      NA       NA
## DSSPBend:aa_changeV>E              NA         NA      NA       NA
## DSSPβ-bridge:aa_changeV>E          NA         NA      NA       NA
## DSSP310 Helix:aa_changeV>E         NA         NA      NA       NA
## DSSPπ-helix:aa_changeV>E           NA         NA      NA       NA
## DSSPβ-strand:aa_changeV>E          NA         NA      NA       NA
## DSSPTurn:aa_changeV>E              NA         NA      NA       NA
## DSSPα-helix:aa_changeV>F           NA         NA      NA       NA
## DSSPBend:aa_changeV>F      -4.426e-06  3.901e+05       0        1
## DSSPβ-bridge:aa_changeV>F          NA         NA      NA       NA
## DSSP310 Helix:aa_changeV>F         NA         NA      NA       NA
## DSSPπ-helix:aa_changeV>F           NA         NA      NA       NA
## DSSPβ-strand:aa_changeV>F          NA         NA      NA       NA
## DSSPTurn:aa_changeV>F              NA         NA      NA       NA
## DSSPα-helix:aa_changeV>G           NA         NA      NA       NA
## DSSPBend:aa_changeV>G              NA         NA      NA       NA
## DSSPβ-bridge:aa_changeV>G   1.727e-18  5.036e+05       0        1
## DSSP310 Helix:aa_changeV>G         NA         NA      NA       NA
## DSSPπ-helix:aa_changeV>G           NA         NA      NA       NA
## DSSPβ-strand:aa_changeV>G          NA         NA      NA       NA
## DSSPTurn:aa_changeV>G              NA         NA      NA       NA
## DSSPα-helix:aa_changeV>I           NA         NA      NA       NA
## DSSPBend:aa_changeV>I              NA         NA      NA       NA
## DSSPβ-bridge:aa_changeV>I          NA         NA      NA       NA
## DSSP310 Helix:aa_changeV>I         NA         NA      NA       NA
## DSSPπ-helix:aa_changeV>I           NA         NA      NA       NA
## DSSPβ-strand:aa_changeV>I   5.313e+01  3.982e+05       0        1
## DSSPTurn:aa_changeV>I              NA         NA      NA       NA
## DSSPα-helix:aa_changeV>L   -4.420e-06  3.901e+05       0        1
## DSSPBend:aa_changeV>L       2.120e-07  2.980e+05       0        1
## DSSPβ-bridge:aa_changeV>L          NA         NA      NA       NA
## DSSP310 Helix:aa_changeV>L         NA         NA      NA       NA
## DSSPπ-helix:aa_changeV>L           NA         NA      NA       NA
## DSSPβ-strand:aa_changeV>L          NA         NA      NA       NA
## DSSPTurn:aa_changeV>L              NA         NA      NA       NA
## DSSPα-helix:aa_changeV>M           NA         NA      NA       NA
## DSSPBend:aa_changeV>M       4.419e-06  3.561e+05       0        1
## DSSPβ-bridge:aa_changeV>M          NA         NA      NA       NA
## DSSP310 Helix:aa_changeV>M         NA         NA      NA       NA
## DSSPπ-helix:aa_changeV>M           NA         NA      NA       NA
## DSSPβ-strand:aa_changeV>M          NA         NA      NA       NA
## DSSPTurn:aa_changeV>M              NA         NA      NA       NA
## DSSPα-helix:aa_changeW>C    5.439e-09  2.908e+05       0        1
## DSSPBend:aa_changeW>C       2.001e-07  2.908e+05       0        1
## DSSPβ-bridge:aa_changeW>C  -4.637e-06  4.112e+05       0        1
## DSSP310 Helix:aa_changeW>C         NA         NA      NA       NA
## DSSPπ-helix:aa_changeW>C           NA         NA      NA       NA
## DSSPβ-strand:aa_changeW>C          NA         NA      NA       NA
## DSSPTurn:aa_changeW>C              NA         NA      NA       NA
## DSSPα-helix:aa_changeW>G           NA         NA      NA       NA
## DSSPBend:aa_changeW>G       4.434e-06  4.362e+05       0        1
## DSSPβ-bridge:aa_changeW>G          NA         NA      NA       NA
## DSSP310 Helix:aa_changeW>G         NA         NA      NA       NA
## DSSPπ-helix:aa_changeW>G           NA         NA      NA       NA
## DSSPβ-strand:aa_changeW>G          NA         NA      NA       NA
## DSSPTurn:aa_changeW>G              NA         NA      NA       NA
## DSSPα-helix:aa_changeW>L    4.415e-06  4.112e+05       0        1
## DSSPBend:aa_changeW>L       4.609e-06  5.036e+05       0        1
## DSSPβ-bridge:aa_changeW>L   4.410e-06  5.036e+05       0        1
## DSSP310 Helix:aa_changeW>L -6.715e-23  5.036e+05       0        1
## DSSPπ-helix:aa_changeW>L           NA         NA      NA       NA
## DSSPβ-strand:aa_changeW>L          NA         NA      NA       NA
## DSSPTurn:aa_changeW>L              NA         NA      NA       NA
## DSSPα-helix:aa_changeW>R   -7.913e-17  3.982e+05       0        1
## DSSPBend:aa_changeW>R       1.334e-08  4.362e+05       0        1
## DSSPβ-bridge:aa_changeW>R   2.027e-07  5.036e+05       0        1
## DSSP310 Helix:aa_changeW>R         NA         NA      NA       NA
## DSSPπ-helix:aa_changeW>R           NA         NA      NA       NA
## DSSPβ-strand:aa_changeW>R   1.021e-08  3.901e+05       0        1
## DSSPTurn:aa_changeW>R              NA         NA      NA       NA
## DSSPα-helix:aa_changeW>S   -4.410e-06  5.036e+05       0        1
## DSSPBend:aa_changeW>S              NA         NA      NA       NA
## DSSPβ-bridge:aa_changeW>S          NA         NA      NA       NA
## DSSP310 Helix:aa_changeW>S         NA         NA      NA       NA
## DSSPπ-helix:aa_changeW>S           NA         NA      NA       NA
## DSSPβ-strand:aa_changeW>S          NA         NA      NA       NA
## DSSPTurn:aa_changeW>S              NA         NA      NA       NA
## DSSPα-helix:aa_changeY>C    5.313e+01  4.362e+05       0        1
## DSSPBend:aa_changeY>C       5.313e+01  4.362e+05       0        1
## DSSPβ-bridge:aa_changeY>C          NA         NA      NA       NA
## DSSP310 Helix:aa_changeY>C         NA         NA      NA       NA
## DSSPπ-helix:aa_changeY>C    5.313e+01  4.362e+05       0        1
## DSSPβ-strand:aa_changeY>C   5.313e+01  3.561e+05       0        1
## DSSPTurn:aa_changeY>C              NA         NA      NA       NA
## DSSPα-helix:aa_changeY>D           NA         NA      NA       NA
## DSSPBend:aa_changeY>D      -2.003e-07  4.362e+05       0        1
## DSSPβ-bridge:aa_changeY>D          NA         NA      NA       NA
## DSSP310 Helix:aa_changeY>D         NA         NA      NA       NA
## DSSPπ-helix:aa_changeY>D           NA         NA      NA       NA
## DSSPβ-strand:aa_changeY>D          NA         NA      NA       NA
## DSSPTurn:aa_changeY>D              NA         NA      NA       NA
## DSSPα-helix:aa_changeY>F           NA         NA      NA       NA
## DSSPBend:aa_changeY>F      -2.250e-09  4.362e+05       0        1
## DSSPβ-bridge:aa_changeY>F          NA         NA      NA       NA
## DSSP310 Helix:aa_changeY>F         NA         NA      NA       NA
## DSSPπ-helix:aa_changeY>F           NA         NA      NA       NA
## DSSPβ-strand:aa_changeY>F          NA         NA      NA       NA
## DSSPTurn:aa_changeY>F              NA         NA      NA       NA
## DSSPα-helix:aa_changeY>H           NA         NA      NA       NA
## DSSPBend:aa_changeY>H       2.244e-29  5.036e+05       0        1
## DSSPβ-bridge:aa_changeY>H          NA         NA      NA       NA
## DSSP310 Helix:aa_changeY>H         NA         NA      NA       NA
## DSSPπ-helix:aa_changeY>H           NA         NA      NA       NA
## DSSPβ-strand:aa_changeY>H          NA         NA      NA       NA
## DSSPTurn:aa_changeY>H              NA         NA      NA       NA
## DSSPα-helix:aa_changeY>N    5.092e-29  4.362e+05       0        1
## DSSPBend:aa_changeY>N       5.683e-30  5.036e+05       0        1
## DSSPβ-bridge:aa_changeY>N          NA         NA      NA       NA
## DSSP310 Helix:aa_changeY>N         NA         NA      NA       NA
## DSSPπ-helix:aa_changeY>N           NA         NA      NA       NA
## DSSPβ-strand:aa_changeY>N          NA         NA      NA       NA
## DSSPTurn:aa_changeY>N              NA         NA      NA       NA
## DSSPα-helix:aa_changeY>S           NA         NA      NA       NA
## DSSPBend:aa_changeY>S       5.313e+01  5.036e+05       0        1
## DSSPβ-bridge:aa_changeY>S          NA         NA      NA       NA
## DSSP310 Helix:aa_changeY>S         NA         NA      NA       NA
## DSSPπ-helix:aa_changeY>S           NA         NA      NA       NA
## DSSPβ-strand:aa_changeY>S          NA         NA      NA       NA
## DSSPTurn:aa_changeY>S              NA         NA      NA       NA
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 9.9393e+02  on 861  degrees of freedom
## Residual deviance: 5.0010e-09  on 435  degrees of freedom
## AIC: 854
## 
## Number of Fisher Scoring iterations: 25
# Standard diagnostic plots for the fitted GLM.
plot(model)

# Coefficient table of the fit (estimate, std. error, z value, p-value) as a
# data frame. The component is named `coefficients`; `$coefficient` only
# resolved through `$` partial matching on the summary list, so spell it out.
results_specific <- as.data.frame(summary(model)$coefficients)
# Estimates are on the log-odds scale; exponentiate to get odds ratios.
results_specific$Odds_ratio <- exp(results_specific[["Estimate"]])
# Keep terms with a Wald p-value below 0.05, ordered by increasing odds ratio.
results_specific_sig <- results_specific %>%
  dplyr::filter(`Pr(>|z|)` < 0.05) %>%
  dplyr::arrange(Odds_ratio)
# No significant interactions
# NOTE(review): the printed summary of `model` shows a residual deviance of
# ~5e-09, 25 Fisher scoring iterations and standard errors of order 1e5, which
# looks like (quasi-)complete separation. In that situation every Wald p-value
# is ~1 regardless of the data, so an empty `results_specific_sig` should not
# be read as evidence that there is no effect -- confirm with a simpler or
# penalized model.



# Logistic regression of `is_ss` on the Grantham distance of the substitution.
model_is_ss <- glm(
  formula = is_ss ~ grantham_distance,
  family = binomial,
  data = structure_analysis_specific
)
# Print the coefficient table and deviance summary.
summary(model_is_ss)
## 
## Call:
## glm(formula = is_ss ~ grantham_distance, family = binomial, data = structure_analysis_specific)
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)       0.710002   0.160841   4.414 1.01e-05 ***
## grantham_distance 0.003643   0.001648   2.210   0.0271 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 993.93  on 861  degrees of freedom
## Residual deviance: 988.94  on 860  degrees of freedom
## AIC: 992.94
## 
## Number of Fisher Scoring iterations: 4
# Distribution of Grantham distances for each value of `is_ss`; the jitter is
# seeded so the point positions are reproducible.
ggplot(
  data = structure_analysis_specific,
  mapping = aes(x = is_ss, y = grantham_distance)
) +
  geom_violin(alpha = 0.5) +
  geom_jitter(
    position = position_jitter(width = 0.2, seed = 1),
    alpha = 0.5
  ) +
  labs(x = "Secondary structure", y = "Grantham distance") +
  theme_bw()

# Fit regression model
# Is the amino acid substitution important for the type of SS?
# The formula drops the intercept (`0 +`), so every substitution gets its own
# coefficient instead of being contrasted against a reference substitution.
# NOTE(review): with a binomial family a factor response is modelled as
# "first level" vs "any other level". If DSSP is the multi-level factor
# tabulated elsewhere in this file, this is effectively Coil vs. every other
# class -- confirm that is the intended contrast.
model_specific <- glm(
  formula = DSSP ~ 0 + aa_change,
  family = binomial,
  data = structure_analysis_specific
)

# Get summary of the model
summary(model_specific)
## 
## Call:
## glm(formula = DSSP ~ 0 + aa_change, family = binomial, data = structure_analysis_specific)
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)   
## aa_changeA>D  1.857e+01  2.663e+03   0.007  0.99444   
## aa_changeA>E -1.404e-16  6.325e-01   0.000  1.00000   
## aa_changeA>G  2.877e-01  7.638e-01   0.377  0.70642   
## aa_changeA>P  5.596e-01  6.268e-01   0.893  0.37194   
## aa_changeA>S  9.808e-01  6.770e-01   1.449  0.14740   
## aa_changeA>T  9.163e-01  8.367e-01   1.095  0.27344   
## aa_changeA>V  1.609e+00  1.095e+00   1.469  0.14178   
## aa_changeC>F  1.857e+01  3.766e+03   0.005  0.99607   
## aa_changeC>R  6.931e-01  8.660e-01   0.800  0.42349   
## aa_changeC>S  1.857e+01  4.612e+03   0.004  0.99679   
## aa_changeC>W  1.857e+01  3.766e+03   0.005  0.99607   
## aa_changeC>Y  1.857e+01  3.766e+03   0.005  0.99607   
## aa_changeD>A  1.523e-17  8.165e-01   0.000  1.00000   
## aa_changeD>E  1.504e+00  7.817e-01   1.924  0.05435 . 
## aa_changeD>G  1.857e+01  3.261e+03   0.006  0.99546   
## aa_changeD>H  4.055e-01  9.129e-01   0.444  0.65692   
## aa_changeD>N  1.030e+00  5.210e-01   1.976  0.04812 * 
## aa_changeD>V  1.946e+00  1.069e+00   1.820  0.06872 . 
## aa_changeD>Y  1.386e+00  6.455e-01   2.148  0.03174 * 
## aa_changeE>A  6.931e-01  1.225e+00   0.566  0.57143   
## aa_changeE>D  1.946e+00  1.069e+00   1.820  0.06872 . 
## aa_changeE>G  1.946e+00  1.069e+00   1.820  0.06872 . 
## aa_changeE>K  1.540e+00  6.362e-01   2.421  0.01547 * 
## aa_changeE>Q  1.099e+00  1.155e+00   0.951  0.34139   
## aa_changeE>V  1.253e+00  8.018e-01   1.562  0.11818   
## aa_changeF>C -1.857e+01  6.523e+03  -0.003  0.99773   
## aa_changeF>I  1.857e+01  4.612e+03   0.004  0.99679   
## aa_changeF>L  0.000e+00  8.165e-01   0.000  1.00000   
## aa_changeF>V -1.901e-17  1.414e+00   0.000  1.00000   
## aa_changeF>Y  1.857e+01  6.523e+03   0.003  0.99773   
## aa_changeG>A  1.386e+00  1.118e+00   1.240  0.21500   
## aa_changeG>C  1.299e+00  6.513e-01   1.995  0.04607 * 
## aa_changeG>D  6.190e-01  4.688e-01   1.320  0.18668   
## aa_changeG>E  1.609e+00  1.095e+00   1.469  0.14178   
## aa_changeG>R  9.445e-01  4.454e-01   2.120  0.03398 * 
## aa_changeG>S  1.386e+00  5.590e-01   2.480  0.01314 * 
## aa_changeG>V  1.099e+00  4.082e-01   2.691  0.00712 **
## aa_changeG>W  6.931e-01  1.225e+00   0.566  0.57143   
## aa_changeH>D  6.931e-01  1.225e+00   0.566  0.57143   
## aa_changeH>L  6.931e-01  8.660e-01   0.800  0.42349   
## aa_changeH>N  6.931e-01  1.225e+00   0.566  0.57143   
## aa_changeH>P  9.163e-01  8.367e-01   1.095  0.27344   
## aa_changeH>Q  5.108e-01  7.303e-01   0.699  0.48425   
## aa_changeH>R -2.877e-01  7.638e-01  -0.377  0.70642   
## aa_changeH>Y -4.055e-01  9.129e-01  -0.444  0.65692   
## aa_changeI>F  1.792e+00  1.080e+00   1.659  0.09715 . 
## aa_changeI>M -1.857e+01  6.523e+03  -0.003  0.99773   
## aa_changeI>N  1.857e+01  2.306e+03   0.008  0.99358   
## aa_changeI>T  1.857e+01  6.523e+03   0.003  0.99773   
## aa_changeI>V  0.000e+00  1.000e+00   0.000  1.00000   
## aa_changeK>E  1.386e+00  1.118e+00   1.240  0.21500   
## aa_changeK>N  1.857e+01  3.766e+03   0.005  0.99607   
## aa_changeK>Q  1.857e+01  4.612e+03   0.004  0.99679   
## aa_changeL>F -3.140e-16  1.414e+00   0.000  1.00000   
## aa_changeL>H  1.570e-16  1.414e+00   0.000  1.00000   
## aa_changeL>I  6.931e-01  1.225e+00   0.566  0.57143   
## aa_changeL>M  0.000e+00  8.165e-01   0.000  1.00000   
## aa_changeL>P  1.540e+00  6.362e-01   2.421  0.01547 * 
## aa_changeL>Q  6.931e-01  8.660e-01   0.800  0.42349   
## aa_changeL>R  1.386e+00  7.906e-01   1.754  0.07951 . 
## aa_changeL>V  6.931e-01  1.225e+00   0.566  0.57143   
## aa_changeM>I  4.055e-01  9.129e-01   0.444  0.65692   
## aa_changeM>K  1.099e+00  1.155e+00   0.951  0.34139   
## aa_changeM>L  6.931e-01  1.225e+00   0.566  0.57143   
## aa_changeM>R  4.055e-01  9.129e-01   0.444  0.65692   
## aa_changeM>T  1.857e+01  6.523e+03   0.003  0.99773   
## aa_changeM>V  1.857e+01  4.612e+03   0.004  0.99679   
## aa_changeN>D  1.857e+01  3.261e+03   0.006  0.99546   
## aa_changeN>H  0.000e+00  1.414e+00   0.000  1.00000   
## aa_changeN>I  8.473e-01  6.901e-01   1.228  0.21950   
## aa_changeN>K  9.163e-01  5.916e-01   1.549  0.12143   
## aa_changeN>S  1.857e+01  3.766e+03   0.005  0.99607   
## aa_changeN>T -6.931e-01  1.225e+00  -0.566  0.57143   
## aa_changeN>Y  1.857e+01  2.663e+03   0.007  0.99444   
## aa_changeP>A  0.000e+00  1.000e+00   0.000  1.00000   
## aa_changeP>H -6.931e-01  1.225e+00  -0.566  0.57143   
## aa_changeP>L  1.542e-01  5.563e-01   0.277  0.78172   
## aa_changeP>Q -1.204e+00  6.583e-01  -1.829  0.06740 . 
## aa_changeP>R -9.163e-01  8.367e-01  -1.095  0.27344   
## aa_changeP>S  2.877e-01  7.638e-01   0.377  0.70642   
## aa_changeP>T  1.823e-01  6.055e-01   0.301  0.76334   
## aa_changeQ>E  1.857e+01  3.261e+03   0.006  0.99546   
## aa_changeQ>H  1.609e+00  7.746e-01   2.078  0.03773 * 
## aa_changeQ>K  0.000e+00  6.325e-01   0.000  1.00000   
## aa_changeQ>L -1.857e+01  3.766e+03  -0.005  0.99607   
## aa_changeQ>P  1.386e+00  1.118e+00   1.240  0.21500   
## aa_changeQ>R  1.386e+00  1.118e+00   1.240  0.21500   
## aa_changeR>C  1.386e+00  6.455e-01   2.148  0.03174 * 
## aa_changeR>G  1.099e+00  8.165e-01   1.346  0.17846   
## aa_changeR>H  1.609e+00  7.746e-01   2.078  0.03773 * 
## aa_changeR>L  1.179e+00  5.718e-01   2.061  0.03926 * 
## aa_changeR>P  1.609e+00  6.325e-01   2.545  0.01094 * 
## aa_changeR>Q  1.857e+01  4.612e+03   0.004  0.99679   
## aa_changeR>S  3.365e-01  5.855e-01   0.575  0.56554   
## aa_changeR>W  1.857e+01  6.523e+03   0.003  0.99773   
## aa_changeS>C  4.055e-01  9.129e-01   0.444  0.65692   
## aa_changeS>F  1.857e+01  6.523e+03   0.003  0.99773   
## aa_changeS>G -6.931e-01  1.225e+00  -0.566  0.57143   
## aa_changeS>I -6.931e-01  1.225e+00  -0.566  0.57143   
## aa_changeS>L  1.609e+00  1.095e+00   1.469  0.14178   
## aa_changeS>N -5.551e-16  1.000e+00   0.000  1.00000   
## aa_changeS>P  7.850e-17  7.071e-01   0.000  1.00000   
## aa_changeS>R -1.823e-01  6.055e-01  -0.301  0.76334   
## aa_changeS>T -1.857e+01  6.523e+03  -0.003  0.99773   
## aa_changeS>W  1.857e+01  3.766e+03   0.005  0.99607   
## aa_changeS>Y  1.857e+01  6.523e+03   0.003  0.99773   
## aa_changeT>A  1.099e+00  1.155e+00   0.951  0.34139   
## aa_changeT>I  1.857e+01  4.612e+03   0.004  0.99679   
## aa_changeT>K  1.857e+01  3.261e+03   0.006  0.99546   
## aa_changeT>M  1.857e+01  3.766e+03   0.005  0.99607   
## aa_changeT>N -1.857e+01  6.523e+03  -0.003  0.99773   
## aa_changeT>P  2.877e-01  7.638e-01   0.377  0.70642   
## aa_changeT>R  1.857e+01  3.766e+03   0.005  0.99607   
## aa_changeT>S  1.099e+00  1.155e+00   0.951  0.34139   
## aa_changeV>A  1.857e+01  2.917e+03   0.006  0.99492   
## aa_changeV>D  1.857e+01  2.465e+03   0.008  0.99399   
## aa_changeV>E  1.857e+01  3.766e+03   0.005  0.99607   
## aa_changeV>F  1.857e+01  2.663e+03   0.007  0.99444   
## aa_changeV>G  1.857e+01  4.612e+03   0.004  0.99679   
## aa_changeV>I  1.386e+00  1.118e+00   1.240  0.21500   
## aa_changeV>L  1.857e+01  2.306e+03   0.008  0.99358   
## aa_changeV>M  1.857e+01  3.261e+03   0.006  0.99546   
## aa_changeW>C  1.857e+01  2.063e+03   0.009  0.99282   
## aa_changeW>G  1.857e+01  3.766e+03   0.005  0.99607   
## aa_changeW>L  1.857e+01  2.465e+03   0.008  0.99399   
## aa_changeW>R  1.857e+01  1.809e+03   0.010  0.99181   
## aa_changeW>S  1.857e+01  4.612e+03   0.004  0.99679   
## aa_changeY>C  9.163e-01  8.367e-01   1.095  0.27344   
## aa_changeY>D  1.857e+01  3.766e+03   0.005  0.99607   
## aa_changeY>F  1.857e+01  3.766e+03   0.005  0.99607   
## aa_changeY>H  1.857e+01  4.612e+03   0.004  0.99679   
## aa_changeY>N  1.857e+01  3.261e+03   0.006  0.99546   
## aa_changeY>S  0.000e+00  1.414e+00   0.000  1.00000   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1195.0  on 862  degrees of freedom
## Residual deviance:  798.2  on 729  degrees of freedom
## AIC: 1064.2
## 
## Number of Fisher Scoring iterations: 17
# Base-R diagnostic plots for the per-substitution model.
plot(model_specific)

# Wald coefficient table of the per-substitution model. The summary component
# is named `coefficients`; `$coefficient` only worked via partial matching.
results_specific <- as.data.frame(summary(model_specific)$coefficients)
# Strip the leading "aa_change" term prefix so row names read like "G>V".
# The pattern is anchored so only the prefix can ever be removed.
rownames(results_specific) <- rownames(results_specific) %>% str_remove("^aa_change")
# NOTE(review): the model has no intercept (`DSSP ~ 0 + aa_change`), so each
# estimate is the log-odds for that substitution on its own. `Odds_ratio`
# therefore holds exp(log-odds), i.e. odds rather than a ratio against a
# reference substitution, and the Wald p-value tests odds = 1. The column
# name is kept because later code filters on it.
results_specific$Odds_ratio <- exp(results_specific[["Estimate"]])
# Keep substitutions with p < 0.05, ordered by increasing odds.
results_specific_sig <- results_specific %>%
  dplyr::filter(`Pr(>|z|)` < 0.05) %>%
  dplyr::arrange(Odds_ratio)
kableExtra::kbl(results_specific_sig, digits = 2)
Estimate Std. Error z value Pr(>|z|) Odds_ratio
G>R 0.94 0.45 2.12 0.03 2.57
D>N 1.03 0.52 1.98 0.05 2.80
G>V 1.10 0.41 2.69 0.01 3.00
R>L 1.18 0.57 2.06 0.04 3.25
G>C 1.30 0.65 1.99 0.05 3.67
G>S 1.39 0.56 2.48 0.01 4.00
D>Y 1.39 0.65 2.15 0.03 4.00
R>C 1.39 0.65 2.15 0.03 4.00
E>K 1.54 0.64 2.42 0.02 4.67
L>P 1.54 0.64 2.42 0.02 4.67
R>H 1.61 0.77 2.08 0.04 5.00
Q>H 1.61 0.77 2.08 0.04 5.00
R>P 1.61 0.63 2.54 0.01 5.00
# Print the significant substitutions at full precision (the kable above is
# rounded to 2 digits).
results_specific_sig
##      Estimate Std. Error  z value    Pr(>|z|) Odds_ratio
## G>R 0.9444616  0.4454354 2.120311 0.033979822   2.571429
## D>N 1.0296194  0.5209881 1.976282 0.048122832   2.800000
## G>V 1.0986123  0.4082483 2.691040 0.007122975   3.000000
## R>L 1.1786550  0.5717719 2.061408 0.039264152   3.250000
## G>C 1.2992830  0.6513389 1.994788 0.046066029   3.666667
## G>S 1.3862944  0.5590170 2.479879 0.013142707   4.000000
## D>Y 1.3862944  0.6454972 2.147638 0.031742525   4.000000
## R>C 1.3862944  0.6454972 2.147638 0.031742525   4.000000
## E>K 1.5404450  0.6362090 2.421288 0.015465632   4.666667
## L>P 1.5404450  0.6362090 2.421288 0.015465632   4.666667
## R>H 1.6094379  0.7745967 2.077775 0.037730050   5.000000
## Q>H 1.6094379  0.7745967 2.077775 0.037730050   5.000000
## R>P 1.6094379  0.6324555 2.544745 0.010935764   5.000000

Amino acid substitutions and specific structural relationships

# Want to know whether we find more x to y substitutions at given features

# Number of observations of each substitution within each DSSP class.
# The result stays grouped (tally() drops only the last grouping variable).
ss_and_sub <- structure_analysis_specific %>%
  group_by(aa_change, DSSP, `Ref A.A.`, `Alt A.A.`) %>%
  tally()


# What is the composition of beta gal in terms of secondary structure
# Residues with a missing DSSP assignment are counted as "Coil".
# (`mutate_at()` is superseded; a plain `mutate()` does the same job.)
beta_gal_comp <- ss_df %>%
  mutate(DSSP = replace_na(DSSP, "Coil")) %>%
  group_by(DSSP) %>%
  tally(name = "n_beta_gal") %>%
  mutate(pct_beta_gal = round(100 * n_beta_gal / (sum(n_beta_gal)), digits = 1))

# What is the proportion of each secondary structure in terms of number of mutations recovered
mut_ss_comp <- structure_analysis_specific %>%
  dplyr::group_by(DSSP) %>%
  tally(name = "n_muts") %>%
  mutate(pct_in_muts = round(100 * n_muts / (sum(n_muts)), digits = 1))

# One row per DSSP class: residue counts in the protein next to mutation counts.
composition <- left_join(beta_gal_comp, mut_ss_comp, by = "DSSP")

# Classes x 2 table (column 1 = residues in the protein, column 2 = mutations)
# for a chi-squared test on the two sets of per-class counts.
# NOTE(review): a class present in the protein but absent from the mutation
# data would yield NA in `n_muts` after the left join -- confirm every class
# has at least one mutation before trusting the test.
contingency_table <- matrix(c(composition$n_beta_gal, composition$n_muts), nrow = nrow(composition))
result <- chisq.test(contingency_table)
print(result)
## 
##  Pearson's Chi-squared test
## 
## data:  contingency_table
## X-squared = 19.16, df = 7, p-value = 0.0077
# Create a vector of the letters
# (every residue code that occurs as a wild-type or a mutant amino acid)
letters_set <- c(ss_and_sub$`Ref A.A.`, ss_and_sub$`Alt A.A.`) %>% unique()

# Generate all possible combinations
combinations <- expand.grid(`Ref A.A.` = letters_set, `Alt A.A.` = letters_set)

# Make data frame
# One zero-count row per WT x mutant pair; used as the empty background grid
# of the heatmaps. `check.names = FALSE` keeps the non-syntactic column names
# (written out in full: `F` is an ordinary, reassignable variable).
boxes <- data.frame(
  `Ref A.A.` = combinations[[1]],
  `Alt A.A.` = combinations[[2]],
  n = 0, check.names = FALSE
)

# Largest count found in a single (substitution, DSSP class) cell, looking only
# at cells with a known DSSP class that were observed more than once. Used as
# the top legend break in the heatmaps.
max_n <- ss_and_sub %>%
  as.data.frame() %>%
  subset(!is.na(DSSP) & n > 1) %>%
  pull(n) %>%
  max()

# Per-cell counts with a facet label of the form "<class>, n=<total in class>".
# The result stays grouped by DSSP.
data_for_plot <- ss_and_sub %>%
  as.data.frame() %>%
  subset(!is.na(DSSP)) %>%
  group_by(DSSP) %>%
  mutate(label = paste0(DSSP, ", n=", sum(n)))

# Put the facet labels in display order by permuting their sorted unique
# values.
# NOTE(review): the index vector assumes exactly eight labels and a specific
# (locale-dependent) sort order -- re-check it if the classes or locale change.
data_for_plot$label <- factor(
  data_for_plot$label,
  levels = sort(unique(data_for_plot$label))[c(3, 5, 1, 8, 7, 6, 4, 2)]
)

# Names of the significant substitutions, split by whether the exponentiated
# estimate is below or above 1. `which()` drops NA comparisons, as a filter
# would.
coil_specific <- rownames(results_specific_sig)[
  which(results_specific_sig$Odds_ratio < 1)
]
ss_specific <- rownames(results_specific_sig)[
  which(results_specific_sig$Odds_ratio > 1)
]

# Heatmap of WT -> mutant substitution counts, one panel per DSSP class.
# The first tile layer draws the empty grid of all residue pairs; the second
# draws the observed counts on a log colour scale.
# NOTE(review): ggplot2 >= 3.4.0 deprecates `size` for tile outlines in favour
# of `linewidth`; switch once the minimum ggplot2 version is confirmed.
ggplot(
  data = data_for_plot,
  mapping = aes(x = `Ref A.A.`, y = `Alt A.A.`, fill = n)
) +
  geom_tile(
    data = boxes,
    colour = "lightgrey",
    fill = "white",
    size = 0.05
  ) +
  geom_tile(color = "grey") +
  # geom_tile(data = data_for_plot %>% dplyr::filter(aa_change %in% coil_specific), fill = NA, color = "red", size = 0.5) +
  # geom_tile(data = data_for_plot %>% dplyr::filter(aa_change %in% ss_specific), fill = NA, color = "green", size = 0.5) +
  facet_wrap2(~label, axes = "all", ncol = 4) +
  coord_fixed() +
  scale_fill_viridis_c(
    option = "F",
    name = "# Observations",
    direction = -1,
    trans = "log",
    breaks = c(1, 5, 20, 50, max_n),
    labels = c(1, 5, 20, 50, max_n)
  ) +
  labs(x = "WT Residue", y = "Mutant Residue") +
  theme_bw() +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    strip.background = element_blank(),
    strip.text.x = element_text(size = rel(1.3)),
    panel.border = element_rect(colour = "black", fill = NA)
  )

# Overall heatmap of WT -> mutant substitution counts with the significant
# substitutions outlined (red: `coil_specific`, green: `ss_specific`).
#
# `data_for_plot` has one row per (substitution, DSSP class). Without a facet
# those rows land on the same tile and overplot each other, so the visible
# fill was whichever class happened to be drawn last. Pool the counts over
# DSSP first so that each tile shows the total number of observations.
overall_counts <- data_for_plot %>%
  dplyr::ungroup() %>%
  dplyr::group_by(aa_change, `Ref A.A.`, `Alt A.A.`) %>%
  dplyr::summarise(n = sum(n), .groups = "drop")

# Legend breaks: the fixed ladder plus the pooled maximum, without duplicates
# and without breaks above the largest pooled count.
overall_breaks <- sort(unique(c(1, 5, 20, 50, max(overall_counts$n))))
overall_breaks <- overall_breaks[overall_breaks <= max(overall_counts$n)]

ggplot(
  overall_counts,
  aes(x = `Ref A.A.`, y = `Alt A.A.`, fill = n)
) +
  # Empty background grid of every residue pair.
  geom_tile(data = boxes, colour = "lightgrey", fill = "white", size = 0.05) +
  geom_tile(color = "grey") +
  geom_tile(data = overall_counts %>% dplyr::filter(aa_change %in% coil_specific), fill = NA, color = "red", size = 0.5) +
  geom_tile(data = overall_counts %>% dplyr::filter(aa_change %in% ss_specific), fill = NA, color = "green", size = 0.5) +
  coord_fixed() +
  theme_bw() +
  xlab("WT Residue") +
  ylab("Mutant Residue") +
  scale_fill_viridis_c(
    option = "F",
    name = "# Observations",
    direction = -1,
    trans = "log",
    breaks = overall_breaks,
    labels = overall_breaks
  ) +
  theme(panel.grid.major = element_blank())

# Counts per DSSP class for each significant substitution, one panel per
# substitution; panels follow the row order of `results_specific_sig`.
data_for_plot %>%
  dplyr::filter(aa_change %in% rownames(results_specific_sig)) %>%
  dplyr::mutate(
    aa_change = factor(aa_change, levels = rownames(results_specific_sig))
  ) %>%
  ggplot(mapping = aes(x = DSSP, y = n)) +
  geom_col() +
  facet_wrap(~aa_change, scales = "free_y") +
  labs(x = "Secondary structure class", y = "Observed substitutions") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

# Number of rows of `structure_analysis` in each DSSP class.
structure_analysis %>%
  dplyr::group_by(DSSP) %>%
  tally()
## # A tibble: 8 × 2
##   DSSP          n
##   <fct>     <int>
## 1 Coil        227
## 2 α-helix      79
## 3 Bend        111
## 4 β-bridge     28
## 5 310 Helix    20
## 6 π-helix      15
## 7 β-strand    283
## 8 Turn         99
# Print the per-class residue and mutation counts and percentages.
composition
## # A tibble: 8 × 5
##   DSSP      n_beta_gal pct_beta_gal n_muts pct_in_muts
##   <chr>          <int>        <dbl>  <int>       <dbl>
## 1 310 Helix         32          3.2     20         2.3
## 2 Bend             106         10.5    111        12.9
## 3 Coil             230         22.7    227        26.3
## 4 Turn             118         11.7     99        11.5
## 5 α-helix          104         10.3     79         9.2
## 6 β-bridge          24          2.4     28         3.2
## 7 β-strand         392         38.8    283        32.8
## 8 π-helix            5          0.5     15         1.7
# Put the DSSP classes in display order by permuting the sorted level names.
# NOTE(review): the index vector assumes eight classes in a specific sort
# order, and the statement is not idempotent -- running it a second time
# permutes the already-permuted levels again.
composition$DSSP <- factor(
  composition$DSSP,
  levels(factor(composition$DSSP))[c(3, 5, 1, 8, 7, 6, 4, 2)]
)
# Side-by-side bars of the two percentage columns (every column except DSSP
# and the raw counts is pivoted to long form).
ggplot(
  composition %>% pivot_longer(c(-DSSP, -n_beta_gal, -n_muts)),
  aes(x = DSSP, y = value, fill = name)
) +
  geom_col(position = "dodge") +
  theme_bw() +
  xlab("Secondary structure class") +
  ylab("Proportion of residues or substitutions") +
  # `name = NULL` is the documented way to omit a legend title;
  # `element_blank()` is a theme element, not a scale name.
  scale_fill_discrete(name = NULL, labels = c("β-Gal Protein", "Mutation data")) +
  theme(legend.position = "bottom")

# How many changes to proline?

structure_analysis %>% dplyr::filter(`Alt A.A.` == "P")
## # A tibble: 73 × 27
## # Rowwise: 
##    Position `Ref A.A.` `Alt A.A.` Consequence Type  Codon aa_change PositionRef
##       <dbl> <chr>      <chr>      <chr>       <chr> <dbl> <chr>           <dbl>
##  1       19 S          P          missense    SNV       7 S>P                16
##  2      463 A          P          missense    SNV     149 A>P               445
##  3      629 R          P          missense    SNV     204 R>P               611
##  4      647 R          P          missense    SNV     210 R>P               629
##  5     1016 R          P          missense    SNV     333 R>P               998
##  6     1085 R          P          missense    SNV     356 R>P              1067
##  7     1174 A          P          missense    SNV     386 A>P              1156
##  8     1181 R          P          missense    SNV     388 R>P              1163
##  9     1229 R          P          missense    SNV     404 R>P              1211
## 10     1271 H          P          missense    SNV     418 H>P              1253
## # ℹ 63 more rows
## # ℹ 19 more variables: residue_code <chr>, alt_code <chr>, CodonRef <dbl>,
## #   residue_name <chr>, Domain <fct>, Buried.or.Exposed <chr>,
## #   NetSurf.Amino.Acid <chr>, Probability.for.Alpha.Helix <dbl>,
## #   Probability.for.Beta.strand <dbl>, Probability.for.Coil <dbl>,
## #   secondary_structure <chr>, grantham_distance <dbl>, conservative <lgl>,
## #   pdb_ss <fct>, Chain <chr>, ss_code <chr>, solvent_accessibility <dbl>, …
# Complement of the proline query: rows whose mutant residue is not proline.
dplyr::filter(structure_analysis, `Alt A.A.` != "P")
## # A tibble: 789 × 27
## # Rowwise: 
##    Position `Ref A.A.` `Alt A.A.` Consequence Type  Codon aa_change PositionRef
##       <dbl> <chr>      <chr>      <chr>       <chr> <dbl> <chr>           <dbl>
##  1       41 P          H          missense    SNV       8 P>H                23
##  2       51 L          F          missense    SNV      11 L>F                33
##  3      109 P          T          missense    SNV      31 P>T                91
##  4      110 P          L          missense    SNV      31 P>L                92
##  5      112 P          T          missense    SNV      32 P>T                94
##  6      122 S          I          missense    SNV      35 S>I               104
##  7      130 N          Y          missense    SNV      38 N>Y               112
##  8      132 N          K          missense    SNV      38 N>K               114
##  9      138 E          D          missense    SNV      40 E>D               120
## 10      149 T          S          missense    SNV      44 T>S               131
## # ℹ 779 more rows
## # ℹ 19 more variables: residue_code <chr>, alt_code <chr>, CodonRef <dbl>,
## #   residue_name <chr>, Domain <fct>, Buried.or.Exposed <chr>,
## #   NetSurf.Amino.Acid <chr>, Probability.for.Alpha.Helix <dbl>,
## #   Probability.for.Beta.strand <dbl>, Probability.for.Coil <dbl>,
## #   secondary_structure <chr>, grantham_distance <dbl>, conservative <lgl>,
## #   pdb_ss <fct>, Chain <chr>, ss_code <chr>, solvent_accessibility <dbl>, …
# What are the amino-acid frequencies in the reference protein, among the
# wild-type residues that were mutated, and among the mutant residues?

# Letter counts of the reference sequence, one row per letter
# (`aa` = letter, `n_betagal` = count).
betagal_proportions <- alphabetFrequency(laczref_aa) %>%
  t() %>%
  as.data.frame() %>%
  tibble::rownames_to_column(var = "aa") %>%
  dplyr::rename(n_betagal = V1)

# How often each amino acid is the wild-type residue of a mutation.
wts <- structure_analysis %>%
  group_by(`Ref A.A.`) %>%
  tally(name = "n_wt") %>%
  dplyr::rename(aa = `Ref A.A.`)
# How often each amino acid is the mutant residue of a mutation.
subs <- structure_analysis %>%
  group_by(`Alt A.A.`) %>%
  tally(name = "n_mut") %>%
  dplyr::rename(aa = `Alt A.A.`)

# Long table with one row per (amino acid, source): the percentage and the raw
# count for the reference protein, the wild-type residues and the mutant
# residues.
#
# The previous version reshaped the percentage columns and the count columns
# with two separate pivot_longer() calls. That produced a 3 x 3 Cartesian
# product per amino acid: every percentage was paired with every count, and
# each bar in the plot was drawn three times. Pivoting both column families in
# one step keeps each percentage next to its own count.
aa_proportions_wt_and_muts <- betagal_proportions %>%
  # Explicit keys instead of the implicit natural join on `aa`.
  left_join(wts, by = "aa") %>%
  left_join(subs, by = "aa") %>%
  # Drop letters that never occur as a wild-type or as a mutant residue.
  # NOTE(review): the percentages below are computed after this filter, so
  # their denominators only include the letters that were kept.
  na.omit() %>%
  mutate(
    pct_betagal = round(100 * n_betagal / (sum(n_betagal)), digits = 1),
    pct_wt = round(100 * n_wt / (sum(n_wt)), digits = 1),
    pct_mut = round(100 * n_mut / (sum(n_mut)), digits = 1)
  ) %>%
  # "n_<source>" / "pct_<source>" -> columns `n` and `pct`, keyed by <source>.
  pivot_longer(
    cols = -aa,
    names_to = c(".value", "source"),
    names_sep = "_"
  ) %>%
  # Restore the original column names, order and label values.
  transmute(
    aa,
    source = paste0("pct_", source),
    `Percent of residues` = pct,
    source_count = sub("^pct_", "n_", source),
    n
  )

# Fix the legend/bar order: protein, wild-type residues, mutant residues.
aa_proportions_wt_and_muts$source <- factor(aa_proportions_wt_and_muts$source, levels = c("pct_betagal", "pct_wt", "pct_mut"))

# Percentage of each amino acid in the protein, among wild-type residues and
# among mutant residues. Each amino acid gets its own facet, so the x-axis
# text and ticks are redundant and are blanked.
ggplot(aa_proportions_wt_and_muts, aes(x = aa, y = `Percent of residues`, fill = source)) +
  geom_col(position = "dodge") +
  # `name = NULL` is the documented way to omit a legend title;
  # `element_blank()` is a theme element, not a scale name.
  scale_fill_discrete(name = NULL, labels = c("β-Gal Protein", "Wild type residues", "Mutated residues")) +
  theme_bw() +
  facet_grid(~aa, scales = "free_x") +
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()) +
  xlab("Amino acid")

# Substitutions ranked by how often they were observed (most frequent first).
structure_analysis_specific %>%
  group_by(aa_change) %>%
  tally() %>%
  arrange(desc(n))
## # A tibble: 133 × 2
##    aa_change     n
##    <fct>     <int>
##  1 G>V          32
##  2 G>R          25
##  3 G>D          20
##  4 G>S          20
##  5 D>N          19
##  6 R>P          18
##  7 E>K          17
##  8 L>P          17
##  9 R>L          17
## 10 D>Y          15
## # ℹ 123 more rows

Size of side chain and hydrophobicity

# Hydropathy table shipped with idpr; its `V2` column is renamed to
# `hydropathy` (`V1` is used later as the residue join key).
hydropathy <- idpr::KDNorm %>% dplyr::rename("hydropathy" = "V2")

# Tab-separated table of physicochemical classes per amino acid.
# `header = TRUE` is written out in full: `T` is an ordinary, reassignable
# variable.
aa_classes <- read.table("data/raw/physicochemical_classes.txt",
  sep = "\t",
  header = TRUE
)

# Reshape the class table to one row per amino acid, with two columns per
# property: the first piece of each cell and the second piece (renamed to
# "<property> score" below).
# NOTE(review): presumably each cell looks like "label (score)" -- the input
# file is not visible here, confirm against data/raw/physicochemical_classes.txt.
aa_classes_cleaned <- aa_classes %>%
  # Long form: one row per (amino acid, property), cell text in `value`.
  pivot_longer(-Amino.acid, names_to = "category") %>%
  # Split each cell at "(" into separate rows.
  separate_rows(value, sep = "\\(") %>%
  # Remove the first remaining parenthesis in each piece, then trim whitespace.
  mutate(value = str_remove(value, "[()]")) %>%
  mutate(value = trimws(value)) %>%
  # Number the pieces within each (amino acid, property): 1, 2, ...
  group_by(Amino.acid, category) %>%
  mutate(row_num = row_number()) %>%
  # One column per property ...
  pivot_wider(names_from = category, values_from = value) %>%
  # ... then one column per (property, piece number), named "<property>_<n>".
  # NOTE(review): `c(3:9)` selects the property columns by position, which
  # assumes exactly seven properties in the input file -- confirm.
  pivot_wider(names_from = row_num, values_from = c(3:9))

# Piece 1 keeps the bare property name; piece 2 becomes "<property> score".
# NOTE(review): both patterns are unanchored, so they would also match "_1" or
# "_2" in the middle of a property name.
colnames(aa_classes_cleaned) <- colnames(aa_classes_cleaned) %>% str_remove("_1")
colnames(aa_classes_cleaned) <- colnames(aa_classes_cleaned) %>% str_replace_all("_2", " score")


# Attach hydropathy and physicochemical class/score columns for the wild-type
# residue (unsuffixed columns) and for the mutant residue (".MUT" suffix), then
# compute mutant-minus-wild-type differences. The join order matters: columns
# from the first join of each table keep their plain names, so only the second
# join triggers the ".MUT" suffix.
structure_analysis_physicochemical <- structure_analysis_specific %>%
  left_join(
    hydropathy,
    by = c("Ref A.A." = "V1"),
    suffix = c("", ".WT")
  ) %>%
  left_join(
    hydropathy,
    by = c("Alt A.A." = "V1"),
    suffix = c("", ".MUT")
  ) %>%
  left_join(
    aa_classes_cleaned,
    by = c("Ref A.A." = "Amino.acid"),
    suffix = c("", ".WT")
  ) %>%
  left_join(
    aa_classes_cleaned,
    by = c("Alt A.A." = "Amino.acid"),
    suffix = c("", ".MUT")
  ) %>%
  mutate(
    # Score columns are converted to numeric before differencing.
    across(contains("score"), as.numeric),
    Hydropathy_diff = hydropathy.MUT - hydropathy,
    Volume_diff = `Volume score.MUT` - `Volume score`,
    Chemical_diff = `Chemical score.MUT` - `Chemical score`,
    Physicochemical_diff = `Physicochemical score.MUT` - `Physicochemical score`,
    Charge_diff = `Charge score.MUT` - `Charge score`,
    Polarity_diff = `Polarity score.MUT` - `Polarity score`
  )

# Model the Volume_diff
# Quasi-Poisson regression of the Grantham distance on the signed
# mutant-minus-wild-type difference in volume score.
model_vol <- glm(
  formula = grantham_distance ~ Volume_diff,
  family = quasipoisson,
  data = structure_analysis_physicochemical
)
summary(model_vol)
## 
## Call:
## glm(formula = grantham_distance ~ Volume_diff, family = quasipoisson, 
##     data = structure_analysis_physicochemical)
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.49169    0.01865 240.870   <2e-16 ***
## Volume_diff  0.00826    0.01095   0.754    0.451    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for quasipoisson family taken to be 25.9943)
## 
##     Null deviance: 23402  on 861  degrees of freedom
## Residual deviance: 23387  on 860  degrees of freedom
## AIC: NA
## 
## Number of Fisher Scoring iterations: 5
# Grantham distance against the volume-score difference (points jittered; no
# seed is set, so positions differ between runs).
ggplot(
  data = structure_analysis_physicochemical,
  mapping = aes(x = Volume_diff, y = grantham_distance)
) +
  geom_jitter()

# this result is not interesting or surprising because the Grantham distance takes into account the physicochemical properties of the residue change.
# NOTE(review): the fit printed above is not significant (p = 0.451).
# `Volume_diff` is a signed difference while the Grantham distance is
# unsigned, which may explain this -- consider whether abs(Volume_diff) was
# intended.

# Model the Hydropathy_diff
# Quasi-Poisson regression of the Grantham distance on the signed
# mutant-minus-wild-type difference in hydropathy.
model_hydro <- glm(
  formula = grantham_distance ~ Hydropathy_diff,
  family = quasipoisson,
  data = structure_analysis_physicochemical
)
summary(model_hydro)
## 
## Call:
## glm(formula = grantham_distance ~ Hydropathy_diff, family = quasipoisson, 
##     data = structure_analysis_physicochemical)
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      4.48696    0.01834 244.612  < 2e-16 ***
## Hydropathy_diff  0.16635    0.04232   3.931 9.13e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for quasipoisson family taken to be 25.48792)
## 
##     Null deviance: 23402  on 861  degrees of freedom
## Residual deviance: 23008  on 860  degrees of freedom
## AIC: NA
## 
## Number of Fisher Scoring iterations: 5
# Grantham distance against the hydropathy difference (points jittered; no
# seed is set, so positions differ between runs).
ggplot(
  data = structure_analysis_physicochemical,
  mapping = aes(x = Hydropathy_diff, y = grantham_distance)
) +
  geom_jitter()

# this result is not interesting or surprising because the Grantham distance takes into account the physicochemical properties of the residue change.

# What about a relation between physiochemical properties and the ability to disrupt specific types of secondary structure?

# Hydropathy
# NOTE(review): with a binomial family a factor response is modelled as
# "first level" vs "any other level", so this presumably contrasts Coil with
# all structured classes pooled rather than individual classes -- confirm.
model_ss_hydro <- glm(
  formula = DSSP ~ Hydropathy_diff,
  family = binomial,
  data = structure_analysis_physicochemical
)
summary(model_ss_hydro)
## 
## Call:
## glm(formula = DSSP ~ Hydropathy_diff, family = binomial, data = structure_analysis_physicochemical)
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      1.02986    0.07753  13.283   <2e-16 ***
## Hydropathy_diff -0.04055    0.18014  -0.225    0.822    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 993.93  on 861  degrees of freedom
## Residual deviance: 993.87  on 860  degrees of freedom
## AIC: 997.87
## 
## Number of Fisher Scoring iterations: 4
# Hydropathy difference (mutant minus wild type) per DSSP class.
ggplot(structure_analysis_physicochemical, aes(x = DSSP, y = Hydropathy_diff)) +
  geom_violin(alpha = 0.5) +
  # Argument names spelled out: `w` / `h` only worked through partial
  # argument matching of `width` / `height`.
  geom_jitter(position = position_jitter(width = 0.2, height = 0.25), alpha = 0.5) +
  theme_bw() +
  xlab("Secondary structure class") +
  ylab("Relative hydropathy (mutation:wild type)")

# Side chain volume
# NOTE(review): with a binomial family a factor response is modelled as
# "first level" vs "any other level", so this presumably contrasts Coil with
# all structured classes pooled rather than individual classes -- confirm.
model_ss_vol <- glm(DSSP ~ Volume_diff, data = structure_analysis_physicochemical, family = binomial)
summary(model_ss_vol)
## 
## Call:
## glm(formula = DSSP ~ Volume_diff, family = binomial, data = structure_analysis_physicochemical)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  1.06826    0.08012  13.333   <2e-16 ***
## Volume_diff -0.11272    0.04690  -2.403   0.0163 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 993.93  on 861  degrees of freedom
## Residual deviance: 988.06  on 860  degrees of freedom
## AIC: 992.06
## 
## Number of Fisher Scoring iterations: 4
# Volume-score difference (mutant minus wild type) per DSSP class.
ggplot(structure_analysis_physicochemical, aes(x = DSSP, y = Volume_diff)) +
  geom_violin(alpha = 0.5) +
  # Argument names spelled out: `w` / `h` only worked through partial
  # argument matching of `width` / `height`.
  geom_jitter(position = position_jitter(width = 0.2, height = 0.25), alpha = 0.5) +
  theme_bw() +
  xlab("Secondary structure class") +
  ylab("Relative side chain volume (mutation:wild type)")

Complete alignment between the MutaMouse lacZ sequence and the GenBank reference (V00296.1), showing the 15 bp insertion in the MutaMouse sequence

# Print every column block of the alignment rather than the default
# abbreviated view.
print(alignment, show = "complete")
## 
## MsaDNAMultipleAlignment with 2 rows and 3096 columns
##     aln (1..54)                                            names
## [1] ---ACCATGATTACGGATTCACTGG---------------CCGTCGTTTTACAA V00296.1 E. coli ...
## [2] ATGACCATGATTACGGATTCACTGGAATTCCCGGGGATCCCCGTCGTTTTACAA lacZ
## Con ???ACCATGATTACGGATTCACTGG???????????????CCGTCGTTTTACAA Consensus 
## 
##     aln (55..108)                                          names
## [1] CGTCGTGACTGGGAAAACCCTGGCGTTACCCAACTTAATCGCCTTGCAGCACAT V00296.1 E. coli ...
## [2] CGTCGTGACTGGGAAAACCCTGGCGTTACCCAACTTAATCGCCTTGCAGCACAT lacZ
## Con CGTCGTGACTGGGAAAACCCTGGCGTTACCCAACTTAATCGCCTTGCAGCACAT Consensus 
## 
##     aln (109..162)                                         names
## [1] CCCCCTTTCGCCAGCTGGCGTAATAGCGAAGAGGCCCGCACCGATCGCCCTTCC V00296.1 E. coli ...
## [2] CCCCCTTTCGCCAGCTGGCGTAATAGCGAAGAGGCCCGCACCGATCGCCCTTCC lacZ
## Con CCCCCTTTCGCCAGCTGGCGTAATAGCGAAGAGGCCCGCACCGATCGCCCTTCC Consensus 
## 
##     aln (163..216)                                         names
## [1] CAACAGTTGCGCAGCCTGAATGGCGAATGGCGCTTTGCCTGGTTTCCGGCACCA V00296.1 E. coli ...
## [2] CAACAGTTGCGCAGCCTGAATGGCGAATGGCGCTTTGCCTGGTTTCCGGCACCA lacZ
## Con CAACAGTTGCGCAGCCTGAATGGCGAATGGCGCTTTGCCTGGTTTCCGGCACCA Consensus 
## 
##     aln (217..270)                                         names
## [1] GAAGCGGTGCCGGAAAGCTGGCTGGAGTGCGATCTTCCTGAGGCCGATACTGTC V00296.1 E. coli ...
## [2] GAAGCGGTGCCGGAAAGCTGGCTGGAGTGCGATCTTCCTGAGGCCGATACTGTC lacZ
## Con GAAGCGGTGCCGGAAAGCTGGCTGGAGTGCGATCTTCCTGAGGCCGATACTGTC Consensus 
## 
##     aln (271..324)                                         names
## [1] GTCGTCCCCTCAAACTGGCAGATGCACGGTTACGATGCGCCCATCTACACCAAC V00296.1 E. coli ...
## [2] GTCGTCCCCTCAAACTGGCAGATGCACGGTTACGATGCGCCCATCTACACCAAC lacZ
## Con GTCGTCCCCTCAAACTGGCAGATGCACGGTTACGATGCGCCCATCTACACCAAC Consensus 
## 
##     aln (325..378)                                         names
## [1] GTAACCTATCCCATTACGGTCAATCCGCCGTTTGTTCCCACGGAGAATCCGACG V00296.1 E. coli ...
## [2] GTGACCTATCCCATTACGGTCAATCCGCCGTTTGTTCCCACGGAGAATCCGACG lacZ
## Con GT?ACCTATCCCATTACGGTCAATCCGCCGTTTGTTCCCACGGAGAATCCGACG Consensus 
## 
##     aln (379..432)                                         names
## [1] GGTTGTTACTCGCTCACATTTAATGTTGATGAAAGCTGGCTACAGGAAGGCCAG V00296.1 E. coli ...
## [2] GGTTGTTACTCGCTCACATTTAATGTTGATGAAAGCTGGCTACAGGAAGGCCAG lacZ
## Con GGTTGTTACTCGCTCACATTTAATGTTGATGAAAGCTGGCTACAGGAAGGCCAG Consensus 
## 
##     aln (433..486)                                         names
## [1] ACGCGAATTATTTTTGATGGCGTTAACTCGGCGTTTCATCTGTGGTGCAACGGG V00296.1 E. coli ...
## [2] ACGCGAATTATTTTTGATGGCGTTAACTCGGCGTTTCATCTGTGGTGCAACGGG lacZ
## Con ACGCGAATTATTTTTGATGGCGTTAACTCGGCGTTTCATCTGTGGTGCAACGGG Consensus 
## 
##     aln (487..540)                                         names
## [1] CGCTGGGTCGGTTACGGCCAGGACAGTCGTTTGCCGTCTGAATTTGACCTGAGC V00296.1 E. coli ...
## [2] CGCTGGGTCGGTTACGGCCAGGACAGTCGTTTGCCGTCTGAATTTGACCTGAGC lacZ
## Con CGCTGGGTCGGTTACGGCCAGGACAGTCGTTTGCCGTCTGAATTTGACCTGAGC Consensus 
## 
##     aln (541..594)                                         names
## [1] GCATTTTTACGCGCCGGAGAAAACCGCCTCGCGGTGATGGTGCTGCGTTGGAGT V00296.1 E. coli ...
## [2] GCATTTTTACGCGCCGGAGAAAACCGCCTCGCGGTGATGGTGCTGCGCTGGAGT lacZ
## Con GCATTTTTACGCGCCGGAGAAAACCGCCTCGCGGTGATGGTGCTGCG?TGGAGT Consensus 
## 
##     aln (595..648)                                         names
## [1] GACGGCAGTTATCTGGAAGATCAGGATATGTGGCGGATGAGCGGCATTTTCCGT V00296.1 E. coli ...
## [2] GACGGCAGTTATCTGGAAGATCAGGATATGTGGCGGATGAGCGGCATTTTCCGT lacZ
## Con GACGGCAGTTATCTGGAAGATCAGGATATGTGGCGGATGAGCGGCATTTTCCGT Consensus 
## 
##     aln (649..702)                                         names
## [1] GACGTCTCGTTGCTGCATAAACCGACTACACAAATCAGCGATTTCCATGTTGCC V00296.1 E. coli ...
## [2] GACGTCTCGTTGCTGCATAAACCGACTACACAAATCAGCGATTTCCATGTTGCC lacZ
## Con GACGTCTCGTTGCTGCATAAACCGACTACACAAATCAGCGATTTCCATGTTGCC Consensus 
## 
##     aln (703..756)                                         names
## [1] ACTCGCTTTAATGATGATTTCAGCCGCGCTGTACTGGAGGCTGAAGTTCAGATG V00296.1 E. coli ...
## [2] ACTCGCTTTAATGATGATTTCAGCCGCGCTGTACTGGAGGCTGAAGTTCAGATG lacZ
## Con ACTCGCTTTAATGATGATTTCAGCCGCGCTGTACTGGAGGCTGAAGTTCAGATG Consensus 
## 
##     aln (757..810)                                         names
## [1] TGCGGCGAGTTGCGTGACTACCTACGGGTAACAGTTTCTTTATGGCAGGGTGAA V00296.1 E. coli ...
## [2] TGCGGCGAGTTGCGTGACTACCTACGGGTAACAGTTTCTTTATGGCAGGGTGAA lacZ
## Con TGCGGCGAGTTGCGTGACTACCTACGGGTAACAGTTTCTTTATGGCAGGGTGAA Consensus 
## 
##     aln (811..864)                                         names
## [1] ACGCAGGTCGCCAGCGGCACCGCGCCTTTCGGCGGTGAAATTATCGATGAGCGT V00296.1 E. coli ...
## [2] ACGCAGGTCGCCAGCGGCACCGCGCCTTTCGGCGGTGAAATTATCGATGAGCGT lacZ
## Con ACGCAGGTCGCCAGCGGCACCGCGCCTTTCGGCGGTGAAATTATCGATGAGCGT Consensus 
## 
##     aln (865..918)                                         names
## [1] GGTGGTTATGCCGATCGCGTCACACTACGTCTGAACGTCGAAAACCCGAAACTG V00296.1 E. coli ...
## [2] GGTGGTTATGCCGATCGCGTCACACTACGTCTGAACGTCGAAAACCCGAAACTG lacZ
## Con GGTGGTTATGCCGATCGCGTCACACTACGTCTGAACGTCGAAAACCCGAAACTG Consensus 
## 
##     aln (919..972)                                         names
## [1] TGGAGCGCCGAAATCCCGAATCTCTATCGTGCGGTGGTTGAACTGCACACCGCC V00296.1 E. coli ...
## [2] TGGAGCGCCGAAATCCCGAATCTCTATCGTGCGGTGGTTGAACTGCACACCGCC lacZ
## Con TGGAGCGCCGAAATCCCGAATCTCTATCGTGCGGTGGTTGAACTGCACACCGCC Consensus 
## 
##     aln (973..1026)                                        names
## [1] GACGGCACGCTGATTGAAGCAGAAGCCTGCGATGTCGGTTTCCGCGAGGTGCGG V00296.1 E. coli ...
## [2] GACGGCACGCTGATTGAAGCAGAAGCCTGCGATGTCGGTTTCCGCGAGGTGCGG lacZ
## Con GACGGCACGCTGATTGAAGCAGAAGCCTGCGATGTCGGTTTCCGCGAGGTGCGG Consensus 
## 
##     aln (1027..1080)                                       names
## [1] ATTGAAAATGGTCTGCTGCTGCTGAACGGCAAGCCGTTGCTGATTCGAGGCGTT V00296.1 E. coli ...
## [2] ATTGAAAATGGTCTGCTGCTGCTGAACGGCAAGCCGTTGCTGATTCGAGGCGTT lacZ
## Con ATTGAAAATGGTCTGCTGCTGCTGAACGGCAAGCCGTTGCTGATTCGAGGCGTT Consensus 
## 
##     aln (1081..1134)                                       names
## [1] AACCGTCACGAGCATCATCCTCTGCATGGTCAGGTCATGGATGAGCAGACGATG V00296.1 E. coli ...
## [2] AACCGTCACGAGCATCATCCTCTGCATGGTCAGGTCATGGATGAGCAGACGATG lacZ
## Con AACCGTCACGAGCATCATCCTCTGCATGGTCAGGTCATGGATGAGCAGACGATG Consensus 
## 
##     aln (1135..1188)                                       names
## [1] GTGCAGGATATCCTGCTGATGAAGCAGAACAACTTTAACGCCGTGCGCTGTTCG V00296.1 E. coli ...
## [2] GTGCAGGATATCCTGCTGATGAAGCAGAACAACTTTAACGCCGTGCGCTGTTCG lacZ
## Con GTGCAGGATATCCTGCTGATGAAGCAGAACAACTTTAACGCCGTGCGCTGTTCG Consensus 
## 
##     aln (1189..1242)                                       names
## [1] CATTATCCGAACCATCCGCTGTGGTACACGCTGTGCGACCGCTACGGCCTGTAT V00296.1 E. coli ...
## [2] CATTATCCGAACCATCCGCTGTGGTACACGCTGTGCGACCGCTACGGCCTGTAT lacZ
## Con CATTATCCGAACCATCCGCTGTGGTACACGCTGTGCGACCGCTACGGCCTGTAT Consensus 
## 
##     aln (1243..1296)                                       names
## [1] GTGGTGGATGAAGCCAATATTGAAACCCACGGCATGGTGCCAATGAATCGTCTG V00296.1 E. coli ...
## [2] GTGGTGGATGAAGCCAATATTGAAACCCACGGCATGGTGCCAATGAATCGTCTG lacZ
## Con GTGGTGGATGAAGCCAATATTGAAACCCACGGCATGGTGCCAATGAATCGTCTG Consensus 
## 
##     aln (1297..1350)                                       names
## [1] ACCGATGATCCGCGCTGGCTACCGGCGATGAGCGAACGCGTAACGCGAATGGTG V00296.1 E. coli ...
## [2] ACCGATGATCCGCGCTGGCTACCGGCGATGAGCGAACGCGTAACGCGAATGGTG lacZ
## Con ACCGATGATCCGCGCTGGCTACCGGCGATGAGCGAACGCGTAACGCGAATGGTG Consensus 
## 
##     aln (1351..1404)                                       names
## [1] CAGCGCGATCGTAATCACCCGAGTGTGATCATCTGGTCGCTGGGGAATGAATCA V00296.1 E. coli ...
## [2] CAGCGCGATCGTAATCACCCGAGTGTGATCATCTGGTCGCTGGGGAATGAATCA lacZ
## Con CAGCGCGATCGTAATCACCCGAGTGTGATCATCTGGTCGCTGGGGAATGAATCA Consensus 
## 
##     aln (1405..1458)                                       names
## [1] GGCCACGGCGCTAATCACGACGCGCTGTATCGCTGGATCAAATCTGTCGATCCT V00296.1 E. coli ...
## [2] GGCCACGGCGCTAATCACGACGCGCTGTATCGCTGGATCAAATCTGTCGATCCT lacZ
## Con GGCCACGGCGCTAATCACGACGCGCTGTATCGCTGGATCAAATCTGTCGATCCT Consensus 
## 
##     aln (1459..1512)                                       names
## [1] TCCCGCCCGGTGCAGTATGAAGGCGGCGGAGCCGACACCACGGCCACCGATATT V00296.1 E. coli ...
## [2] TCCCGCCCGGTGCAGTATGAAGGCGGCGGAGCCGACACCACGGCCACCGATATT lacZ
## Con TCCCGCCCGGTGCAGTATGAAGGCGGCGGAGCCGACACCACGGCCACCGATATT Consensus 
## 
##     aln (1513..1566)                                       names
## [1] ATTTGCCCGATGTACGCGCGCGTGGATGAAGACCAGCCCTTCCCGGCTGTGCCG V00296.1 E. coli ...
## [2] ATTTGCCCGATGTACGCGCGCGTGGATGAAGACCAGCCCTTCCCGGCTGTGCCG lacZ
## Con ATTTGCCCGATGTACGCGCGCGTGGATGAAGACCAGCCCTTCCCGGCTGTGCCG Consensus 
## 
##     aln (1567..1620)                                       names
## [1] AAATGGTCCATCAAAAAATGGCTTTCGCTACCTGGAGAGACGCGCCCGCTGATC V00296.1 E. coli ...
## [2] AAATGGTCCATCAAAAAATGGCTTTCGCTACCTGGAGAGACGCGCCCGCTGATC lacZ
## Con AAATGGTCCATCAAAAAATGGCTTTCGCTACCTGGAGAGACGCGCCCGCTGATC Consensus 
## 
##     aln (1621..1674)                                       names
## [1] CTTTGCGAATACGCCCACGCGATGGGTAACAGTCTTGGCGGTTTCGCTAAATAC V00296.1 E. coli ...
## [2] CTTTGCGAATACGCCCACGCGATGGGTAACAGTCTTGGCGGTTTCGCTAAATAC lacZ
## Con CTTTGCGAATACGCCCACGCGATGGGTAACAGTCTTGGCGGTTTCGCTAAATAC Consensus 
## 
##     aln (1675..1728)                                       names
## [1] TGGCAGGCGTTTCGTCAGTATCCCCGTTTACAGGGCGGCTTCGTCTGGGACTGG V00296.1 E. coli ...
## [2] TGGCAGGCGTTTCGTCAGTATCCCCGTTTACAGGGCGGCTTCGTCTGGGACTGG lacZ
## Con TGGCAGGCGTTTCGTCAGTATCCCCGTTTACAGGGCGGCTTCGTCTGGGACTGG Consensus 
## 
##     aln (1729..1782)                                       names
## [1] GTGGATCAGTCGCTGATTAAATATGATGAAAACGGCAACCCGTGGTCGGCTTAC V00296.1 E. coli ...
## [2] GTGGATCAGTCGCTGATTAAATATGATGAAAACGGCAACCCGTGGTCGGCTTAC lacZ
## Con GTGGATCAGTCGCTGATTAAATATGATGAAAACGGCAACCCGTGGTCGGCTTAC Consensus 
## 
##     aln (1783..1836)                                       names
## [1] GGCGGTGATTTTGGCGATACGCCGAACGATCGCCAGTTCTGTATGAACGGTCTG V00296.1 E. coli ...
## [2] GGCGGTGATTTTGGCGATACGCCGAACGATCGCCAGTTCTGTATGAACGGTCTG lacZ
## Con GGCGGTGATTTTGGCGATACGCCGAACGATCGCCAGTTCTGTATGAACGGTCTG Consensus 
## 
##     aln (1837..1890)                                       names
## [1] GTCTTTGCCGACCGCACGCCGCATCCAGCGCTGACGGAAGCAAAACACCAGCAG V00296.1 E. coli ...
## [2] GTCTTTGCCGACCGCACGCCGCATCCAGCGCTGACGGAAGCAAAACACCAGCAG lacZ
## Con GTCTTTGCCGACCGCACGCCGCATCCAGCGCTGACGGAAGCAAAACACCAGCAG Consensus 
## 
##     aln (1891..1944)                                       names
## [1] CAGTTTTTCCAGTTCCGTTTATCCGGGCAAACCATCGAAGTGACCAGCGAATAC V00296.1 E. coli ...
## [2] CAGTTTTTCCAGTTCCGTTTATCCGGGCAAACCATCGAAGTGACCAGCGAATAC lacZ
## Con CAGTTTTTCCAGTTCCGTTTATCCGGGCAAACCATCGAAGTGACCAGCGAATAC Consensus 
## 
##     aln (1945..1998)                                       names
## [1] CTGTTCCGTCATAGCGATAACGAGCTCCTGCACTGGATGGTGGCGCTGGATGGT V00296.1 E. coli ...
## [2] CTGTTCCGTCATAGCGATAACGAGCTCCTGCACTGGATGGTGGCGCTGGATGGT lacZ
## Con CTGTTCCGTCATAGCGATAACGAGCTCCTGCACTGGATGGTGGCGCTGGATGGT Consensus 
## 
##     aln (1999..2052)                                       names
## [1] AAGCCGCTGGCAAGCGGTGAAGTGCCTCTGGATGTCGCTCCACAAGGTAAACAG V00296.1 E. coli ...
## [2] AAGCCGCTGGCAAGCGGTGAAGTGCCTCTGGATGTCGCTCCACAAGGTAAACAG lacZ
## Con AAGCCGCTGGCAAGCGGTGAAGTGCCTCTGGATGTCGCTCCACAAGGTAAACAG Consensus 
## 
##     aln (2053..2106)                                       names
## [1] TTGATTGAACTGCCTGAACTACCGCAGCCGGAGAGCGCCGGGCAACTCTGGCTC V00296.1 E. coli ...
## [2] TTGATTGAACTGCCTGAACTACCGCAGCCGGAGAGCGCCGGGCAACTCTGGCTC lacZ
## Con TTGATTGAACTGCCTGAACTACCGCAGCCGGAGAGCGCCGGGCAACTCTGGCTC Consensus 
## 
##     aln (2107..2160)                                       names
## [1] ACAGTACGCGTAGTGCAACCGAACGCGACCGCATGGTCAGAAGCCGGGCACATC V00296.1 E. coli ...
## [2] ACAGTACGCGTAGTGCAACCGAACGCGACCGCATGGTCAGAAGCCGGGCACATC lacZ
## Con ACAGTACGCGTAGTGCAACCGAACGCGACCGCATGGTCAGAAGCCGGGCACATC Consensus 
## 
##     aln (2161..2214)                                       names
## [1] AGCGCCTGGCAGCAGTGGCGTCTGGCGGAAAACCTCAGTGTGACGCTCCCCGCC V00296.1 E. coli ...
## [2] AGCGCCTGGCAGCAGTGGCGTCTGGCGGAAAACCTCAGTGTGACGCTCCCCGCC lacZ
## Con AGCGCCTGGCAGCAGTGGCGTCTGGCGGAAAACCTCAGTGTGACGCTCCCCGCC Consensus 
## 
##     aln (2215..2268)                                       names
## [1] GCGTCCCACGCCATCCCGCATCTGACCACCAGCGAAATGGATTTTTGCATCGAG V00296.1 E. coli ...
## [2] GCGTCCCACGCCATCCCGCATCTGACCACCAGCGAAATGGATTTTTGCATCGAG lacZ
## Con GCGTCCCACGCCATCCCGCATCTGACCACCAGCGAAATGGATTTTTGCATCGAG Consensus 
## 
##     aln (2269..2322)                                       names
## [1] CTGGGTAATAAGCGTTGGCAATTTAACCGCCAGTCAGGCTTTCTTTCACAGATG V00296.1 E. coli ...
## [2] CTGGGTAATAAGCGTTGGCAATTTAACCGCCAGTCAGGCTTTCTTTCACAGATG lacZ
## Con CTGGGTAATAAGCGTTGGCAATTTAACCGCCAGTCAGGCTTTCTTTCACAGATG Consensus 
## 
##     aln (2323..2376)                                       names
## [1] TGGATTGGCGATAAAAAACAACTGCTGACGCCGCTGCGCGATCAGTTCACCCGT V00296.1 E. coli ...
## [2] TGGATTGGCGATAAAAAACAACTGCTGACGCCGCTGCGCGATCAGTTCACCCGT lacZ
## Con TGGATTGGCGATAAAAAACAACTGCTGACGCCGCTGCGCGATCAGTTCACCCGT Consensus 
## 
##     aln (2377..2430)                                       names
## [1] GCACCGCTGGATAACGACATTGGCGTAAGTGAAGCGACCCGCATTGACCCTAAC V00296.1 E. coli ...
## [2] GCACCGCTGGATAACGACATTGGCGTAAGTGAAGCGACCCGCATTGACCCTAAC lacZ
## Con GCACCGCTGGATAACGACATTGGCGTAAGTGAAGCGACCCGCATTGACCCTAAC Consensus 
## 
##     aln (2431..2484)                                       names
## [1] GCCTGGGTCGAACGCTGGAAGGCGGCGGGCCATTACCAGGCCGAAGCAGCGTTG V00296.1 E. coli ...
## [2] GCCTGGGTCGAACGCTGGAAGGCGGCGGGCCATTACCAGGCCGAAGCAGCGTTG lacZ
## Con GCCTGGGTCGAACGCTGGAAGGCGGCGGGCCATTACCAGGCCGAAGCAGCGTTG Consensus 
## 
##     aln (2485..2538)                                       names
## [1] TTGCAGTGCACGGCAGATACACTTGCTGATGCGGTGCTGATTACGACCGCTCAC V00296.1 E. coli ...
## [2] TTGCAGTGCACGGCAGATACACTTGCTGATGCGGTGCTGATTACGACCGCTCAC lacZ
## Con TTGCAGTGCACGGCAGATACACTTGCTGATGCGGTGCTGATTACGACCGCTCAC Consensus 
## 
##     aln (2539..2592)                                       names
## [1] GCGTGGCAGCATCAGGGGAAAACCTTATTTATCAGCCGGAAAACCTACCGGATT V00296.1 E. coli ...
## [2] GCGTGGCAGCATCAGGGGAAAACCTTATTTATCAGCCGGAAAACCTACCGGATT lacZ
## Con GCGTGGCAGCATCAGGGGAAAACCTTATTTATCAGCCGGAAAACCTACCGGATT Consensus 
## 
##     aln (2593..2646)                                       names
## [1] GATGGTAGTGGTCAAATGGCGATTACCGTTGATGTTGAAGTGGCGAGCGATACA V00296.1 E. coli ...
## [2] GATGGTAGTGGTCAAATGGCGATTACCGTTGATGTTGAAGTGGCGAGCGATACA lacZ
## Con GATGGTAGTGGTCAAATGGCGATTACCGTTGATGTTGAAGTGGCGAGCGATACA Consensus 
## 
##     aln (2647..2700)                                       names
## [1] CCGCATCCGGCGCGGATTGGCCTGAACTGCCAGCTGGCGCAGGTAGCAGAGCGG V00296.1 E. coli ...
## [2] CCGCATCCGGCGCGGATTGGCCTGAACTGCCAGCTGGCGCAGGTAGCAGAGCGG lacZ
## Con CCGCATCCGGCGCGGATTGGCCTGAACTGCCAGCTGGCGCAGGTAGCAGAGCGG Consensus 
## 
##     aln (2701..2754)                                       names
## [1] GTAAACTGGCTCGGATTAGGGCCGCAAGAAAACTATCCCGACCGCCTTACTGCC V00296.1 E. coli ...
## [2] GTAAACTGGCTCGGATTAGGGCCGCAAGAAAACTATCCCGACCGCCTTACTGCC lacZ
## Con GTAAACTGGCTCGGATTAGGGCCGCAAGAAAACTATCCCGACCGCCTTACTGCC Consensus 
## 
##     aln (2755..2808)                                       names
## [1] GCCTGTTTTGACCGCTGGGATCTGCCATTGTCAGACATGTATACCCCGTACGTC V00296.1 E. coli ...
## [2] GCCTGTTTTGACCGCTGGGATCTGCCATTGTCAGACATGTATACCCCGTACGTC lacZ
## Con GCCTGTTTTGACCGCTGGGATCTGCCATTGTCAGACATGTATACCCCGTACGTC Consensus 
## 
##     aln (2809..2862)                                       names
## [1] TTCCCGAGCGAAAACGGTCTGCGCTGCGGGACGCGCGAATTGAATTATGGCCCA V00296.1 E. coli ...
## [2] TTCCCGAGCGAAAACGGTCTGCGCTGCGGGACGCGCGAATTGAATTATGGCCCA lacZ
## Con TTCCCGAGCGAAAACGGTCTGCGCTGCGGGACGCGCGAATTGAATTATGGCCCA Consensus 
## 
##     aln (2863..2916)                                       names
## [1] CACCAGTGGCGCGGCGACTTCCAGTTCAACATCAGCCGCTACAGTCAACAGCAA V00296.1 E. coli ...
## [2] CACCAGTGGCGCGGCGACTTCCAGTTCAACATCAGCCGCTACAGTCAACAGCAA lacZ
## Con CACCAGTGGCGCGGCGACTTCCAGTTCAACATCAGCCGCTACAGTCAACAGCAA Consensus 
## 
##     aln (2917..2970)                                       names
## [1] CTGATGGAAACCAGCCATCGCCATCTGCTGCACGCGGAAGAAGGCACATGGCTG V00296.1 E. coli ...
## [2] CTGATGGAAACCAGCCATCGCCATCTGCTGCACGCGGAAGAAGGCACATGGCTG lacZ
## Con CTGATGGAAACCAGCCATCGCCATCTGCTGCACGCGGAAGAAGGCACATGGCTG Consensus 
## 
##     aln (2971..3024)                                       names
## [1] AATATCGACGGTTTCCATATGGGGATTGGTGGCGACGACTCCTGGAGCCCGTCA V00296.1 E. coli ...
## [2] AATATCGACGGTTTCCATATGGGGATTGGTGGCGACGACTCCTGGAGCCCGTCA lacZ
## Con AATATCGACGGTTTCCATATGGGGATTGGTGGCGACGACTCCTGGAGCCCGTCA Consensus 
## 
##     aln (3025..3078)                                       names
## [1] GTATCGGCGGAATTCCAGCTGAGCGCCGGTCGCTACCATTACCAGTTGGTCTGG V00296.1 E. coli ...
## [2] GTATCGGCGGAATTACAGCTGAGCGCCGGTCGCTACCATTACCAGTTGGTCTGG lacZ
## Con GTATCGGCGGAATT?CAGCTGAGCGCCGGTCGCTACCATTACCAGTTGGTCTGG Consensus 
## 
##     aln (3079..3096)   names
## [1] TGTCAAAAATAATAATAA V00296.1 E. coli ...
## [2] TGTCAAAAATAATAATAA lacZ
## Con TGTCAAAAATAATAATAA Consensus